diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -131,7 +131,8 @@
     TypeScalarizeVector,  // Replace this one-element vector with its element.
     TypeSplitVector,      // Split this vector into two of half the size.
     TypeWidenVector,      // This vector should be widened into a larger vector.
-    TypePromoteFloat      // Replace this float with a larger one.
+    TypePromoteFloat,     // Replace this float with a larger one.
+    TypeSoftPromoteHalf,  // Soften half to i16 and use float to do arithmetic.
   };

   /// LegalizeKind holds the legalization kind that needs to happen to EVT
@@ -331,6 +332,12 @@
     return TypePromoteInteger;
   }

+  // Return true if the half type should be passed around as i16, but promoted
+  // to float around arithmetic. The default behavior is to pass around as
+  // float and convert around loads/stores/bitcasts and other places where
+  // the size matters.
+  virtual bool softPromoteHalfType() const { return false; }
+
   // There are two general methods for expanding a BUILD_VECTOR node:
   //  1. Use SCALAR_TO_VECTOR on the defined scalar values and then shuffle
   //     them together.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2412,3 +2412,398 @@
 }

+//===----------------------------------------------------------------------===//
+// Half Result Soft Promotion
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
+  LLVM_DEBUG(dbgs() << "Soft promote half result " << ResNo << ": ";
+             N->dump(&DAG); dbgs() << "\n");
+  SDValue R = SDValue();
+
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
+    LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
+    return;
+  }
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "SoftPromoteHalfResult #" << ResNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to soft promote this operator's result!");
+
+  case ISD::BITCAST:    R = SoftPromoteHalfRes_BITCAST(N); break;
+  case ISD::ConstantFP: R = SoftPromoteHalfRes_ConstantFP(N); break;
+  case ISD::EXTRACT_VECTOR_ELT:
+    R = SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(N); break;
+  case ISD::FCOPYSIGN:  R = SoftPromoteHalfRes_FCOPYSIGN(N); break;
+  case ISD::FP_ROUND:   R = SoftPromoteHalfRes_FP_ROUND(N); break;
+
+  // Unary FP Operations
+  case ISD::FABS:
+  case ISD::FCBRT:
+  case ISD::FCEIL:
+  case ISD::FCOS:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FFLOOR:
+  case ISD::FLOG:
+  case ISD::FLOG2:
+  case ISD::FLOG10:
+  case ISD::FNEARBYINT:
+  case ISD::FNEG:
+  case ISD::FRINT:
+  case ISD::FROUND:
+  case ISD::FSIN:
+  case ISD::FSQRT:
+  case ISD::FTRUNC:
+  case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break;
+
+  // Binary FP Operations
+  case ISD::FADD:
+  case ISD::FDIV:
+  case ISD::FMAXIMUM:
+  case ISD::FMINIMUM:
+  case ISD::FMAXNUM:
+  case ISD::FMINNUM:
+  case ISD::FMUL:
+  case ISD::FPOW:
+  case ISD::FREM:
+  case ISD::FSUB:       R = SoftPromoteHalfRes_BinOp(N); break;
+
+  case ISD::FMA:        // FMA is same as FMAD
+  case ISD::FMAD:       R = SoftPromoteHalfRes_FMAD(N); break;
+
+  case ISD::FPOWI:      R = SoftPromoteHalfRes_FPOWI(N); break;
+
+  case ISD::LOAD:       R = SoftPromoteHalfRes_LOAD(N); break;
+  case ISD::SELECT:     R = SoftPromoteHalfRes_SELECT(N); break;
+  case ISD::SELECT_CC:  R = SoftPromoteHalfRes_SELECT_CC(N); break;
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP: R = SoftPromoteHalfRes_XINT_TO_FP(N); break;
+  case ISD::UNDEF:      R = SoftPromoteHalfRes_UNDEF(N); break;
+  case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
+  }
+
+  if (R.getNode())
+    SetSoftPromotedHalf(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BITCAST(SDNode *N) {
+  return BitConvertToInteger(N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ConstantFP(SDNode *N) {
+  ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+
+  // Get the (bit-cast) APInt of the APFloat and build an integer constant.
+  return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN),
+                         MVT::i16);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+  SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+                     NewOp.getValueType().getVectorElementType(), NewOp,
+                     N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FCOPYSIGN(SDNode *N) {
+  SDValue LHS = GetSoftPromotedHalf(N->getOperand(0));
+  SDValue RHS = BitConvertToInteger(N->getOperand(1));
+  SDLoc dl(N);
+
+  EVT LVT = LHS.getValueType();
+  EVT RVT = RHS.getValueType();
+
+  unsigned LSize = LVT.getSizeInBits();
+  unsigned RSize = RVT.getSizeInBits();
+
+  // First get the sign bit of second operand.
+  SDValue SignBit = DAG.getNode(
+      ISD::SHL, dl, RVT, DAG.getConstant(1, dl, RVT),
+      DAG.getConstant(RSize - 1, dl,
+                      TLI.getShiftAmountTy(RVT, DAG.getDataLayout())));
+  SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit);
+
+  // Shift right or sign-extend it if the two operands have different types.
+  int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits();
+  if (SizeDiff > 0) {
+    SignBit =
+        DAG.getNode(ISD::SRL, dl, RVT, SignBit,
+                    DAG.getConstant(SizeDiff, dl,
+                                    TLI.getShiftAmountTy(SignBit.getValueType(),
+                                                         DAG.getDataLayout())));
+    SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit);
+  } else if (SizeDiff < 0) {
+    SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit);
+    SignBit =
+        DAG.getNode(ISD::SHL, dl, LVT, SignBit,
+                    DAG.getConstant(-SizeDiff, dl,
+                                    TLI.getShiftAmountTy(SignBit.getValueType(),
+                                                         DAG.getDataLayout())));
+  }
+
+  // Clear the sign bit of the first operand.
+  SDValue Mask = DAG.getNode(
+      ISD::SHL, dl, LVT, DAG.getConstant(1, dl, LVT),
+      DAG.getConstant(LSize - 1, dl,
+                      TLI.getShiftAmountTy(LVT, DAG.getDataLayout())));
+  Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, dl, LVT));
+  LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask);
+
+  // Or the value with the sign bit.
+  return DAG.getNode(ISD::OR, dl, LVT, LHS, SignBit);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+  SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+  SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+  SDLoc dl(N);
+
+  // Promote to the larger FP type.
+  Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+  Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+  Op2 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op2);
+
+  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);
+
+  // Convert back to FP16 as an integer.
+  return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FPOWI(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+  SDValue Op1 = N->getOperand(1);
+  SDLoc dl(N);
+
+  Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+
+  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);
+
+  // Convert back to FP16 as an integer.
+  return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) {
+  return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), MVT::i16, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_LOAD(SDNode *N) {
+  LoadSDNode *L = cast<LoadSDNode>(N);
+
+  // Load the value as an integer value with the same number of bits.
+  assert(L->getExtensionType() == ISD::NON_EXTLOAD && "Unexpected extension!");
+  SDValue NewL =
+      DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), MVT::i16,
+                  SDLoc(N), L->getChain(), L->getBasePtr(), L->getOffset(),
+                  L->getPointerInfo(), MVT::i16, L->getAlignment(),
+                  L->getMemOperand()->getFlags(), L->getAAInfo());
+  // Legalize the chain result by replacing uses of the old value chain with
+  // the new one.
+  ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+  return NewL;
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT(SDNode *N) {
+  SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+  SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+  return DAG.getSelect(SDLoc(N), Op1.getValueType(), N->getOperand(0), Op1,
+                       Op2);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT_CC(SDNode *N) {
+  SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+  SDValue Op3 = GetSoftPromotedHalf(N->getOperand(3));
+  return DAG.getNode(ISD::SELECT_CC, SDLoc(N), Op2.getValueType(),
+                     N->getOperand(0), N->getOperand(1), Op2, Op3,
+                     N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_XINT_TO_FP(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDLoc dl(N);
+
+  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+
+  // Round the value to the softened type.
+  return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UNDEF(SDNode *N) {
+  return DAG.getUNDEF(MVT::i16);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UnaryOp(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftPromotedHalf(N->getOperand(0));
+  SDLoc dl(N);
+
+  // Promote to the larger FP type.
+  Op = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op);
+
+  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op);
+
+  // Convert back to FP16 as an integer.
+  return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BinOp(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+  SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+  SDLoc dl(N);
+
+  // Promote to the larger FP type.
+  Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+  Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);
+
+  // Convert back to FP16 as an integer.
+  return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+//===----------------------------------------------------------------------===//
+// Half Operand Soft Promotion
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) {
+  LLVM_DEBUG(dbgs() << "Soft promote half operand " << OpNo << ": ";
+             N->dump(&DAG); dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) {
+    LLVM_DEBUG(dbgs() << "Node has been custom lowered, done\n");
+    return false;
+  }
+
+  // Nodes that use a promotion-requiring floating point operand, but don't
+  // produce a soft promotion-requiring floating point result, need to be
+  // legalized to use the soft promoted float operand. Nodes that produce at
+  // least one soft promotion-requiring floating point result have their
+  // operands legalized as a part of SoftPromoteHalfResult.
+  switch (N->getOpcode()) {
+  default:
+  #ifndef NDEBUG
+    dbgs() << "SoftPromoteHalfOperand Op #" << OpNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+  #endif
+    llvm_unreachable("Do not know how to soft promote this operator's operand!");
+
+  case ISD::BITCAST:    Res = SoftPromoteHalfOp_BITCAST(N); break;
+  case ISD::FCOPYSIGN:  Res = SoftPromoteHalfOp_FCOPYSIGN(N, OpNo); break;
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT: Res = SoftPromoteHalfOp_FP_TO_XINT(N); break;
+  case ISD::FP_EXTEND:  Res = SoftPromoteHalfOp_FP_EXTEND(N); break;
+  case ISD::SELECT_CC:  Res = SoftPromoteHalfOp_SELECT_CC(N, OpNo); break;
+  case ISD::SETCC:      Res = SoftPromoteHalfOp_SETCC(N); break;
+  case ISD::STORE:      Res = SoftPromoteHalfOp_STORE(N, OpNo); break;
+  }
+
+  if (!Res.getNode())
+    return false;
+
+  assert(Res.getNode() != N && "Expected a new node!");
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_BITCAST(SDNode *N) {
+  SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+
+  return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FCOPYSIGN(SDNode *N,
+                                                      unsigned OpNo) {
+  assert(OpNo == 1 && "Only Operand 1 must need promotion here");
+  SDValue Op1 = N->getOperand(1);
+  SDLoc dl(N);
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op1.getValueType());
+
+  Op1 = GetSoftPromotedHalf(Op1);
+  Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+  return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), N->getOperand(0),
+                     Op1);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_EXTEND(SDNode *N) {
+  SDValue Op = GetSoftPromotedHalf(N->getOperand(0));
+  return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_TO_XINT(SDNode *N) {
+  SDValue Op = N->getOperand(0);
+  SDLoc dl(N);
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+
+  Op = GetSoftPromotedHalf(Op);
+
+  SDValue Res = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op);
+
+  return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_SELECT_CC(SDNode *N,
+                                                      unsigned OpNo) {
+  assert(OpNo == 0 && "Can only soften the comparison values");
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDLoc dl(N);
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType());
+
+  Op0 = GetSoftPromotedHalf(Op0);
+  Op1 = GetSoftPromotedHalf(Op1);
+
+  // Promote to the larger FP type.
+  Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+  Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+  return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0), Op0, Op1,
+                     N->getOperand(2), N->getOperand(3), N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_SETCC(SDNode *N) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  SDLoc dl(N);
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType());
+
+  Op0 = GetSoftPromotedHalf(Op0);
+  Op1 = GetSoftPromotedHalf(Op1);
+
+  // Promote to the larger FP type.
+  Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+  Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+  return DAG.getSetCC(SDLoc(N), N->getValueType(0), Op0, Op1, CCCode);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 1 && "Can only soften the stored value!");
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  SDValue Val = ST->getValue();
+  SDLoc dl(N);
+
+  assert(!ST->isTruncatingStore() && "Unexpected truncating store.");
+  SDValue Promoted = GetSoftPromotedHalf(Val);
+  return DAG.getStore(ST->getChain(), dl, Promoted, ST->getBasePtr(),
+                      ST->getMemOperand());
+}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -304,6 +304,9 @@
   case TargetLowering::TypeSoftenFloat:
     // Promote the integer operand by hand.
     return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp));
+  case TargetLowering::TypeSoftPromoteHalf:
+    // Promote the integer operand by hand.
+    return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftPromotedHalf(InOp));
   case TargetLowering::TypePromoteFloat: {
     // Convert the promoted float by hand.
     if (!NOutVT.isVector())
@@ -2689,6 +2692,12 @@
   if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat)
     Op = GetPromotedFloat(Op);

+  if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftPromoteHalf) {
+    EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+    Op = GetSoftPromotedHalf(Op);
+    Op = DAG.getNode(ISD::FP16_TO_FP, dl, NFPVT, Op);
+  }
+
   RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
   TargetLowering::MakeLibCallOptions CallOptions;
@@ -2712,6 +2721,12 @@
   if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat)
     Op = GetPromotedFloat(Op);

+  if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftPromoteHalf) {
+    EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+    Op = GetSoftPromotedHalf(Op);
+    Op = DAG.getNode(ISD::FP16_TO_FP, dl, NFPVT, Op);
+  }
+
   RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
   TargetLowering::MakeLibCallOptions CallOptions;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -109,6 +109,10 @@
   /// supported precision, this map indicates what promoted value to use.
   SmallDenseMap<TableId, TableId, 8> PromotedFloats;

+  /// For floating-point nodes that have a smaller precision than the smallest
+  /// supported precision, this map indicates the converted value to use.
+  SmallDenseMap<TableId, TableId, 8> SoftPromotedHalfs;
+
   /// For float nodes that need to be expanded this map indicates which operands
   /// are the expanded version of the input.
   SmallDenseMap<TableId, std::pair<TableId, TableId>, 8> ExpandedFloats;
@@ -186,6 +190,7 @@
     ExpandedIntegers.erase(OldId);
     SoftenedFloats.erase(OldId);
     PromotedFloats.erase(OldId);
+    SoftPromotedHalfs.erase(OldId);
     ExpandedFloats.erase(OldId);
     ScalarizedVectors.erase(OldId);
     SplitVectors.erase(OldId);
@@ -651,6 +656,43 @@
   SDValue PromoteFloatOp_SELECT_CC(SDNode *N, unsigned OpNo);
   SDValue PromoteFloatOp_SETCC(SDNode *N, unsigned OpNo);

+  //===--------------------------------------------------------------------===//
+  // Half soft promotion support: LegalizeFloatTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  SDValue GetSoftPromotedHalf(SDValue Op) {
+    TableId &PromotedId = SoftPromotedHalfs[getTableId(Op)];
+    SDValue PromotedOp = getSDValue(PromotedId);
+    assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+    return PromotedOp;
+  }
+  void SetSoftPromotedHalf(SDValue Op, SDValue Result);
+
+  void SoftPromoteHalfResult(SDNode *N, unsigned ResNo);
+  SDValue SoftPromoteHalfRes_BinOp(SDNode *N);
+  SDValue SoftPromoteHalfRes_BITCAST(SDNode *N);
+  SDValue SoftPromoteHalfRes_ConstantFP(SDNode *N);
+  SDValue SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue SoftPromoteHalfRes_FCOPYSIGN(SDNode *N);
+  SDValue SoftPromoteHalfRes_FMAD(SDNode *N);
+  SDValue SoftPromoteHalfRes_FPOWI(SDNode *N);
+  SDValue SoftPromoteHalfRes_FP_ROUND(SDNode *N);
+  SDValue SoftPromoteHalfRes_LOAD(SDNode *N);
+  SDValue SoftPromoteHalfRes_SELECT(SDNode *N);
+  SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N);
+  SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N);
+  SDValue SoftPromoteHalfRes_XINT_TO_FP(SDNode *N);
+  SDValue SoftPromoteHalfRes_UNDEF(SDNode *N);
+
+  bool SoftPromoteHalfOperand(SDNode *N, unsigned OpNo);
+  SDValue SoftPromoteHalfOp_BITCAST(SDNode *N);
+  SDValue SoftPromoteHalfOp_FCOPYSIGN(SDNode *N, unsigned OpNo);
+  SDValue SoftPromoteHalfOp_FP_EXTEND(SDNode *N);
+  SDValue SoftPromoteHalfOp_FP_TO_XINT(SDNode *N);
+  SDValue SoftPromoteHalfOp_SETCC(SDNode *N);
+  SDValue SoftPromoteHalfOp_SELECT_CC(SDNode *N, unsigned OpNo);
+  SDValue SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo);
+
   //===--------------------------------------------------------------------===//
   // Scalarization Support: LegalizeVectorTypes.cpp
   //===--------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -124,6 +124,8 @@
         Mapped |= 128;
       if (ResId && PromotedFloats.find(ResId) != PromotedFloats.end())
         Mapped |= 256;
+      if (ResId && SoftPromotedHalfs.find(ResId) != SoftPromotedHalfs.end())
+        Mapped |= 512;

       if (Node.getNodeId() != Processed) {
         // Since we allow ReplacedValues to map deleted nodes, it may map nodes
@@ -168,6 +170,8 @@
           dbgs() << " WidenedVectors";
         if (Mapped & 256)
           dbgs() << " PromotedFloats";
+        if (Mapped & 512)
+          dbgs() << " SoftPromoteHalfs";
         dbgs() << "\n";
         llvm_unreachable(nullptr);
       }
@@ -276,6 +280,10 @@
         PromoteFloatResult(N, i);
         Changed = true;
         goto NodeDone;
+      case TargetLowering::TypeSoftPromoteHalf:
+        SoftPromoteHalfResult(N, i);
+        Changed = true;
+        goto NodeDone;
       }
     }
@@ -332,6 +340,10 @@
           NeedsReanalyzing = PromoteFloatOperand(N, i);
           Changed = true;
           break;
+        case TargetLowering::TypeSoftPromoteHalf:
+          NeedsReanalyzing = SoftPromoteHalfOperand(N, i);
+          Changed = true;
+          break;
         }
         break;
       }
@@ -719,6 +731,16 @@
   OpIdEntry = getTableId(Result);
 }

+void
+DAGTypeLegalizer::SetSoftPromotedHalf(SDValue Op, SDValue Result) {
+  assert(Result.getValueType() == MVT::i16 &&
+         "Invalid type for soft-promoted half");
+  AnalyzeNewValue(Result);
+
+  auto &OpIdEntry = SoftPromotedHalfs[getTableId(Op)];
+  assert((OpIdEntry == 0) && "Node is already promoted!");
+  OpIdEntry = getTableId(Result);
+}
+
 void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
   // Note that in some cases vector operation operands may be greater than
   // the vector element type. For example BUILD_VECTOR of type <1 x i1> with
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -50,6 +50,7 @@
   case TargetLowering::TypePromoteInteger:
     break;
   case TargetLowering::TypePromoteFloat:
+  case TargetLowering::TypeSoftPromoteHalf:
     llvm_unreachable("Bitcast of a promotion-needing float should never need"
                      "expansion");
   case TargetLowering::TypeSoftenFloat:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1025,6 +1025,7 @@
   case TargetLowering::TypeLegal:
   case TargetLowering::TypePromoteInteger:
   case TargetLowering::TypePromoteFloat:
+  case TargetLowering::TypeSoftPromoteHalf:
   case TargetLowering::TypeSoftenFloat:
   case TargetLowering::TypeScalarizeVector:
   case TargetLowering::TypeWidenVector:
@@ -3468,6 +3469,7 @@
   }
   case TargetLowering::TypeSoftenFloat:
   case TargetLowering::TypePromoteFloat:
+  case TargetLowering::TypeSoftPromoteHalf:
   case TargetLowering::TypeExpandInteger:
   case TargetLowering::TypeExpandFloat:
   case TargetLowering::TypeScalarizeVector:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -482,9 +482,14 @@
     // Handle cases such as i8 -> <1 x i1>
     EVT ValueSVT = ValueVT.getVectorElementType();
-    if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT)
-      Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
-                                      : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
+    if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT) {
+      if (ValueSVT.getSizeInBits() == PartEVT.getSizeInBits())
+        Val = DAG.getNode(ISD::BITCAST, DL, ValueSVT, Val);
+      else
+        Val = ValueVT.isFloatingPoint()
+                  ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
+                  : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
+    }

     return DAG.getBuildVector(ValueVT, DL, Val);
   }
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -811,6 +811,7 @@
   LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);

   assert((LA == TypeLegal || LA == TypeSoftenFloat ||
+          LA == TypeSoftPromoteHalf ||
           (NVT.isVector() ||
            ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger)) &&
          "Promote may not follow Expand or Promote");
@@ -1229,10 +1230,18 @@
   // promote it to f32, because there are no f16 library calls (except for
   // conversions).
if (!isTypeLegal(MVT::f16)) { - NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32]; - RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32]; - TransformToType[MVT::f16] = MVT::f32; - ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat); + // Allow targets to control how we legalize half. + if (softPromoteHalfType()) { + NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::i16]; + RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::i16]; + TransformToType[MVT::f16] = MVT::f32; + ValueTypeActions.setTypeAction(MVT::f16, TypeSoftPromoteHalf); + } else { + NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32]; + RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32]; + TransformToType[MVT::f16] = MVT::f32; + ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat); + } } // Loop over all of the vector value types to see which need transformations. diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1238,6 +1238,8 @@ /// Customize the preferred legalization strategy for certain types. LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + bool softPromoteHalfType() const override { return true; } + MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override; diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -16,99 +16,17 @@ ; and their calling convention which remain unresolved.) define void @store_half(half* %fptr, half %v) { -; X86-SSE-LABEL: store_half: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 16 -; X86-SSE-NEXT: .cfi_offset %esi, -8 -; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss %xmm0, (%esp) -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: calll __gnu_f2h_ieee -; X86-SSE-NEXT: movw %ax, (%esi) -; X86-SSE-NEXT: addl $8, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl -; -; X86-AVX1-LABEL: store_half: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX1-NEXT: subl $8, %esp -; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 -; X86-AVX1-NEXT: .cfi_offset %esi, -8 -; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vmovss %xmm0, (%esp) -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-AVX1-NEXT: calll __gnu_f2h_ieee -; X86-AVX1-NEXT: movw %ax, (%esi) -; X86-AVX1-NEXT: addl $8, %esp -; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 -; X86-AVX1-NEXT: retl -; -; X86-AVX512-LABEL: store_half: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X86-AVX512-NEXT: vmovd %xmm0, %eax -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: movw %ax, (%ecx) -; X86-AVX512-NEXT: retl -; -; X86-NOSSE-LABEL: store_half: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 -; X86-NOSSE-NEXT: subl $8, %esp -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 16 -; X86-NOSSE-NEXT: .cfi_offset %esi, -8 -; X86-NOSSE-NEXT: flds {{[0-9]+}}(%esp) -; 
X86-NOSSE-NEXT: fstps (%esp) -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: calll __gnu_f2h_ieee -; X86-NOSSE-NEXT: movw %ax, (%esi) -; X86-NOSSE-NEXT: addl $8, %esp -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 8 -; X86-NOSSE-NEXT: popl %esi -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4 -; X86-NOSSE-NEXT: retl -; -; X64-SSE-LABEL: store_half: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: pushq %rbx -; X64-SSE-NEXT: .cfi_def_cfa_offset 16 -; X64-SSE-NEXT: .cfi_offset %rbx, -16 -; X64-SSE-NEXT: movq %rdi, %rbx -; X64-SSE-NEXT: callq __gnu_f2h_ieee -; X64-SSE-NEXT: movw %ax, (%rbx) -; X64-SSE-NEXT: popq %rbx -; X64-SSE-NEXT: .cfi_def_cfa_offset 8 -; X64-SSE-NEXT: retq +; X86-LABEL: store_half: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %ax, (%ecx) +; X86-NEXT: retl ; -; X64-AVX1-LABEL: store_half: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: pushq %rbx -; X64-AVX1-NEXT: .cfi_def_cfa_offset 16 -; X64-AVX1-NEXT: .cfi_offset %rbx, -16 -; X64-AVX1-NEXT: movq %rdi, %rbx -; X64-AVX1-NEXT: callq __gnu_f2h_ieee -; X64-AVX1-NEXT: movw %ax, (%rbx) -; X64-AVX1-NEXT: popq %rbx -; X64-AVX1-NEXT: .cfi_def_cfa_offset 8 -; X64-AVX1-NEXT: retq -; -; X64-AVX512-LABEL: store_half: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: movw %ax, (%rdi) -; X64-AVX512-NEXT: retq +; X64-LABEL: store_half: +; X64: # %bb.0: +; X64-NEXT: movw %si, (%rdi) +; X64-NEXT: retq store atomic half %v, half* %fptr unordered, align 2 ret void } @@ -302,82 +220,16 @@ } define half @load_half(half* %fptr) { -; X86-SSE-LABEL: load_half: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 16 -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movzwl (%eax), %eax -; X86-SSE-NEXT: movl %eax, (%esp) -; X86-SSE-NEXT: calll __gnu_h2f_ieee -; X86-SSE-NEXT: addl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl -; -; X86-AVX1-LABEL: load_half: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: subl $12, %esp -; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movzwl (%eax), %eax -; X86-AVX1-NEXT: movl %eax, (%esp) -; X86-AVX1-NEXT: calll __gnu_h2f_ieee -; X86-AVX1-NEXT: addl $12, %esp -; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 -; X86-AVX1-NEXT: retl -; -; X86-AVX512-LABEL: load_half: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: pushl %eax -; X86-AVX512-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movswl (%eax), %eax -; X86-AVX512-NEXT: vmovd %eax, %xmm0 -; X86-AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; X86-AVX512-NEXT: vmovss %xmm0, (%esp) -; X86-AVX512-NEXT: flds (%esp) -; X86-AVX512-NEXT: popl %eax -; X86-AVX512-NEXT: .cfi_def_cfa_offset 4 -; X86-AVX512-NEXT: retl -; -; X86-NOSSE-LABEL: load_half: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: subl $12, %esp -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 16 -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movzwl (%eax), %eax -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: calll __gnu_h2f_ieee -; X86-NOSSE-NEXT: addl $12, %esp -; X86-NOSSE-NEXT: .cfi_def_cfa_offset 4 -; X86-NOSSE-NEXT: retl -; -; X64-SSE-LABEL: load_half: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: pushq %rax -; X64-SSE-NEXT: .cfi_def_cfa_offset 16 -; X64-SSE-NEXT: movzwl (%rdi), %edi -; X64-SSE-NEXT: callq __gnu_h2f_ieee -; X64-SSE-NEXT: popq %rax -; X64-SSE-NEXT: .cfi_def_cfa_offset 8 -; X64-SSE-NEXT: retq +; X86-LABEL: 
load_half: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: retl ; -; X64-AVX1-LABEL: load_half: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: pushq %rax -; X64-AVX1-NEXT: .cfi_def_cfa_offset 16 -; X64-AVX1-NEXT: movzwl (%rdi), %edi -; X64-AVX1-NEXT: callq __gnu_h2f_ieee -; X64-AVX1-NEXT: popq %rax -; X64-AVX1-NEXT: .cfi_def_cfa_offset 8 -; X64-AVX1-NEXT: retq -; -; X64-AVX512-LABEL: load_half: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: movswl (%rdi), %eax -; X64-AVX512-NEXT: vmovd %eax, %xmm0 -; X64-AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; X64-AVX512-NEXT: retq +; X64-LABEL: load_half: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: retq %v = load atomic half, half* %fptr unordered, align 2 ret half %v } diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -2266,96 +2266,100 @@ define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %arg2) { ; KNL-LABEL: test_concat_v2i1: ; KNL: ## %bb.0: -; KNL-NEXT: movswl (%rdi), %eax +; KNL-NEXT: movswl 2(%rdi), %eax ; KNL-NEXT: vmovd %eax, %xmm0 ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 -; KNL-NEXT: movswl 2(%rdi), %eax -; KNL-NEXT: vmovd %eax, %xmm1 -; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 -; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; KNL-NEXT: vucomiss %xmm2, %xmm1 +; KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; KNL-NEXT: vucomiss %xmm1, %xmm0 ; KNL-NEXT: setb %al ; KNL-NEXT: kmovw %eax, %k0 ; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: vucomiss %xmm2, %xmm0 +; KNL-NEXT: movswl (%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm2 +; KNL-NEXT: vcvtph2ps %xmm2, %xmm2 +; KNL-NEXT: vucomiss %xmm1, %xmm2 ; KNL-NEXT: setb %al ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: korw %k0, %k1, %k0 -; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; KNL-NEXT: vucomiss %xmm2, %xmm1 +; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vucomiss %xmm1, %xmm0 ; KNL-NEXT: seta %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: vucomiss %xmm2, %xmm0 +; KNL-NEXT: vucomiss %xmm1, %xmm2 ; KNL-NEXT: seta %al ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: korw %k1, %k2, %k1 -; KNL-NEXT: kandw %k1, %k0, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k2 -; KNL-NEXT: movswl (%rsi), %eax -; KNL-NEXT: vmovd %eax, %xmm0 -; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 -; KNL-NEXT: movswl 2(%rsi), %eax -; KNL-NEXT: vmovd %eax, %xmm1 -; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 -; KNL-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k2} {z} -; KNL-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z} -; KNL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; KNL-NEXT: vmovd %xmm0, %eax -; KNL-NEXT: movw %ax, (%rdx) -; KNL-NEXT: vcvtps2ph $4, %xmm1, %xmm0 -; KNL-NEXT: vmovd %xmm0, %eax +; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k1 +; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: xorl %eax, %eax +; KNL-NEXT: testb $1, %cl +; KNL-NEXT: movl $0, %ecx +; KNL-NEXT: je LBB85_2 +; KNL-NEXT: ## %bb.1: +; KNL-NEXT: movzwl (%rsi), %ecx +; KNL-NEXT: LBB85_2: +; KNL-NEXT: testb $1, %dil +; KNL-NEXT: je LBB85_4 +; KNL-NEXT: ## %bb.3: +; KNL-NEXT: movzwl 2(%rsi), %eax +; KNL-NEXT: LBB85_4: ; KNL-NEXT: movw %ax, 2(%rdx) +; KNL-NEXT: movw %cx, (%rdx) ; KNL-NEXT: retq ; ; SKX-LABEL: test_concat_v2i1: ; SKX: ## %bb.0: -; SKX-NEXT: movswl (%rdi), %eax +; SKX-NEXT: movswl 2(%rdi), %eax ; SKX-NEXT: vmovd %eax, %xmm0 ; 
SKX-NEXT: vcvtph2ps %xmm0, %xmm0 -; SKX-NEXT: movswl 2(%rdi), %eax -; SKX-NEXT: vmovd %eax, %xmm1 -; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 -; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SKX-NEXT: vucomiss %xmm2, %xmm1 +; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SKX-NEXT: vucomiss %xmm1, %xmm0 ; SKX-NEXT: setb %al ; SKX-NEXT: kmovd %eax, %k0 ; SKX-NEXT: kshiftlb $1, %k0, %k0 -; SKX-NEXT: vucomiss %xmm2, %xmm0 +; SKX-NEXT: movswl (%rdi), %eax +; SKX-NEXT: vmovd %eax, %xmm2 +; SKX-NEXT: vcvtph2ps %xmm2, %xmm2 +; SKX-NEXT: vucomiss %xmm1, %xmm2 ; SKX-NEXT: setb %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korw %k0, %k1, %k0 -; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vucomiss %xmm2, %xmm1 +; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vucomiss %xmm1, %xmm0 ; SKX-NEXT: seta %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlb $1, %k1, %k1 -; SKX-NEXT: vucomiss %xmm2, %xmm0 +; SKX-NEXT: vucomiss %xmm1, %xmm2 ; SKX-NEXT: seta %al ; SKX-NEXT: kmovd %eax, %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $7, %k2, %k2 ; SKX-NEXT: korw %k1, %k2, %k1 -; SKX-NEXT: kandw %k1, %k0, %k1 -; SKX-NEXT: kshiftrb $1, %k1, %k2 -; SKX-NEXT: movswl (%rsi), %eax -; SKX-NEXT: vmovd %eax, %xmm0 -; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 -; SKX-NEXT: movswl 2(%rsi), %eax -; SKX-NEXT: vmovd %eax, %xmm1 -; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 -; SKX-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k2} {z} -; SKX-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z} -; SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; SKX-NEXT: vmovd %xmm0, %eax -; SKX-NEXT: movw %ax, (%rdx) -; SKX-NEXT: vcvtps2ph $4, %xmm1, %xmm0 -; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: kandw %k1, %k0, %k0 +; SKX-NEXT: kshiftrb $1, %k0, %k1 +; SKX-NEXT: kmovd %k1, %edi +; SKX-NEXT: kmovd %k0, %ecx +; SKX-NEXT: xorl %eax, %eax +; SKX-NEXT: testb $1, %cl +; SKX-NEXT: movl $0, %ecx +; SKX-NEXT: je LBB85_2 +; SKX-NEXT: ## %bb.1: +; SKX-NEXT: movzwl (%rsi), %ecx +; SKX-NEXT: LBB85_2: +; SKX-NEXT: testb $1, %dil +; SKX-NEXT: je LBB85_4 +; SKX-NEXT: ## %bb.3: +; SKX-NEXT: movzwl 2(%rsi), %eax +; SKX-NEXT: LBB85_4: ; SKX-NEXT: movw %ax, 2(%rdx) +; SKX-NEXT: movw %cx, (%rdx) ; SKX-NEXT: retq %tmp = load <2 x half>, <2 x half>* %arg, align 8 %tmp3 = fcmp fast olt <2 x half> %tmp, diff --git a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll --- a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll +++ b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll @@ -156,229 +156,203 @@ define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) { ; CHECK-LABEL: test_mask_load_16xf16: ; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 -; CHECK-NEXT: vpmovmskb %xmm0, %ecx -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: vpmovmskb %xmm0, %r11d +; CHECK-NEXT: testb $1, %r11b ; CHECK-NEXT: je 
LBB12_1 ; CHECK-NEXT: ## %bb.2: ## %cond.load -; CHECK-NEXT: movswl (%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm8 +; CHECK-NEXT: movzwl (%rsi), %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: jmp LBB12_3 ; CHECK-NEXT: LBB12_1: -; CHECK-NEXT: vxorps %xmm8, %xmm8, %xmm8 +; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: LBB12_3: ## %else -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vxorps %xmm9, %xmm9, %xmm9 -; CHECK-NEXT: testb $2, %cl +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: testb $2, %r11b ; CHECK-NEXT: je LBB12_4 ; CHECK-NEXT: ## %bb.5: ## %cond.load1 -; CHECK-NEXT: movswl 2(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vmovaps %xmm2, %xmm1 -; CHECK-NEXT: vmovaps %xmm2, %xmm7 -; CHECK-NEXT: vmovaps %xmm2, %xmm6 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vmovaps %xmm2, %xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm3 -; CHECK-NEXT: vmovaps %xmm2, %xmm16 -; CHECK-NEXT: vmovaps %xmm2, %xmm15 -; CHECK-NEXT: vmovaps %xmm2, %xmm14 -; CHECK-NEXT: vmovaps %xmm2, %xmm13 -; CHECK-NEXT: vmovaps %xmm2, %xmm12 -; CHECK-NEXT: vmovaps %xmm2, %xmm11 -; CHECK-NEXT: vmovaps %xmm2, %xmm10 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm2 -; CHECK-NEXT: testb $4, %cl +; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: movl %edi, %r12d +; CHECK-NEXT: movl %edi, %ebx +; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %edi, %r13d +; CHECK-NEXT: movl %edi, %r14d +; CHECK-NEXT: movl %edi, %r8d +; CHECK-NEXT: movl %edi, %r9d +; CHECK-NEXT: movl %edi, %r10d +; CHECK-NEXT: movl %edi, %r15d +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: movzwl 2(%rsi), %edi +; CHECK-NEXT: ## kill: def $di killed $di def $edi +; CHECK-NEXT: testb $4, %r11b ; CHECK-NEXT: jne LBB12_7 ; CHECK-NEXT: jmp LBB12_8 ; CHECK-NEXT: LBB12_4: -; CHECK-NEXT: vmovaps %xmm2, %xmm1 -; CHECK-NEXT: vmovaps %xmm2, %xmm7 -; CHECK-NEXT: vmovaps %xmm2, %xmm6 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vmovaps %xmm2, %xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm3 -; CHECK-NEXT: vmovaps %xmm2, %xmm16 -; CHECK-NEXT: vmovaps %xmm2, %xmm15 -; CHECK-NEXT: vmovaps %xmm2, %xmm14 -; CHECK-NEXT: vmovaps %xmm2, %xmm13 -; CHECK-NEXT: vmovaps %xmm2, %xmm12 -; CHECK-NEXT: vmovaps %xmm2, %xmm11 -; CHECK-NEXT: vmovaps %xmm2, %xmm10 -; CHECK-NEXT: testb $4, %cl +; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: movl %edi, %r12d +; CHECK-NEXT: movl %edi, %ebx +; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %edi, %r13d +; CHECK-NEXT: movl %edi, %r14d +; CHECK-NEXT: movl %edi, %r8d +; CHECK-NEXT: movl %edi, %r9d +; CHECK-NEXT: movl %edi, %r10d +; CHECK-NEXT: movl %edi, %r15d +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: testb $4, %r11b ; CHECK-NEXT: je LBB12_8 ; CHECK-NEXT: LBB12_7: ## %cond.load4 -; CHECK-NEXT: movswl 4(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm1 +; CHECK-NEXT: movzwl 4(%rsi), %ecx +; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK-NEXT: LBB12_8: ## %else5 -; CHECK-NEXT: testb $8, %cl +; CHECK-NEXT: testb $8, %r11b ; CHECK-NEXT: jne LBB12_9 ; 
CHECK-NEXT: ## %bb.10: ## %else8 -; CHECK-NEXT: testb $16, %cl +; CHECK-NEXT: testb $16, %r11b ; CHECK-NEXT: jne LBB12_11 ; CHECK-NEXT: LBB12_12: ## %else11 -; CHECK-NEXT: testb $32, %cl +; CHECK-NEXT: testb $32, %r11b ; CHECK-NEXT: jne LBB12_13 ; CHECK-NEXT: LBB12_14: ## %else14 -; CHECK-NEXT: testb $64, %cl +; CHECK-NEXT: testb $64, %r11b ; CHECK-NEXT: jne LBB12_15 ; CHECK-NEXT: LBB12_16: ## %else17 -; CHECK-NEXT: testb $-128, %cl +; CHECK-NEXT: testb $-128, %r11b ; CHECK-NEXT: jne LBB12_17 ; CHECK-NEXT: LBB12_18: ## %else20 -; CHECK-NEXT: testl $256, %ecx ## imm = 0x100 +; CHECK-NEXT: testl $256, %r11d ## imm = 0x100 ; CHECK-NEXT: jne LBB12_19 ; CHECK-NEXT: LBB12_20: ## %else23 -; CHECK-NEXT: testl $512, %ecx ## imm = 0x200 +; CHECK-NEXT: testl $512, %r11d ## imm = 0x200 ; CHECK-NEXT: jne LBB12_21 ; CHECK-NEXT: LBB12_22: ## %else26 -; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400 +; CHECK-NEXT: testl $1024, %r11d ## imm = 0x400 ; CHECK-NEXT: jne LBB12_23 ; CHECK-NEXT: LBB12_24: ## %else29 -; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800 +; CHECK-NEXT: testl $2048, %r11d ## imm = 0x800 ; CHECK-NEXT: jne LBB12_25 ; CHECK-NEXT: LBB12_26: ## %else32 -; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000 -; CHECK-NEXT: jne LBB12_27 +; CHECK-NEXT: testl $4096, %r11d ## imm = 0x1000 +; CHECK-NEXT: je LBB12_28 +; CHECK-NEXT: LBB12_27: ## %cond.load34 +; CHECK-NEXT: movzwl 24(%rsi), %edx ; CHECK-NEXT: LBB12_28: ## %else35 -; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000 +; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: testl $8192, %r11d ## imm = 0x2000 ; CHECK-NEXT: jne LBB12_29 -; CHECK-NEXT: LBB12_30: ## %else38 -; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000 +; CHECK-NEXT: ## %bb.30: ## %else38 +; CHECK-NEXT: testl $16384, %r11d ## imm = 0x4000 ; CHECK-NEXT: jne LBB12_31 ; CHECK-NEXT: LBB12_32: ## %else41 -; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000 -; CHECK-NEXT: je LBB12_34 -; CHECK-NEXT: LBB12_33: ## %cond.load43 -; CHECK-NEXT: movswl 30(%rsi), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm9 -; CHECK-NEXT: LBB12_34: ## %else44 -; CHECK-NEXT: vcvtps2ph $4, %xmm8, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, (%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm2, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 2(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm1, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 4(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm7, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 6(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm6, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 8(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm5, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 10(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm4, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 12(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm3, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 14(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm16, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 16(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm15, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 18(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm14, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 20(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm13, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 22(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm12, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 24(%rax) -; CHECK-NEXT: 
vcvtps2ph $4, %xmm11, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 26(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm10, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 28(%rax) -; CHECK-NEXT: vcvtps2ph $4, %xmm9, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 30(%rax) -; CHECK-NEXT: retq +; CHECK-NEXT: testl $32768, %r11d ## imm = 0x8000 +; CHECK-NEXT: je LBB12_33 +; CHECK-NEXT: LBB12_34: ## %cond.load43 +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload +; CHECK-NEXT: movzwl 30(%rsi), %esi +; CHECK-NEXT: jmp LBB12_35 ; CHECK-NEXT: LBB12_9: ## %cond.load7 -; CHECK-NEXT: movswl 6(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm7 -; CHECK-NEXT: testb $16, %cl +; CHECK-NEXT: movzwl 6(%rsi), %r12d +; CHECK-NEXT: testb $16, %r11b ; CHECK-NEXT: je LBB12_12 ; CHECK-NEXT: LBB12_11: ## %cond.load10 -; CHECK-NEXT: movswl 8(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm6 -; CHECK-NEXT: testb $32, %cl +; CHECK-NEXT: movzwl 8(%rsi), %ebx +; CHECK-NEXT: testb $32, %r11b ; CHECK-NEXT: je LBB12_14 ; CHECK-NEXT: LBB12_13: ## %cond.load13 -; CHECK-NEXT: movswl 10(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm5 -; CHECK-NEXT: testb $64, %cl +; CHECK-NEXT: movzwl 10(%rsi), %ebp +; CHECK-NEXT: testb $64, %r11b ; CHECK-NEXT: je LBB12_16 ; CHECK-NEXT: LBB12_15: ## %cond.load16 -; CHECK-NEXT: movswl 12(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm4 -; CHECK-NEXT: testb $-128, %cl +; CHECK-NEXT: movzwl 12(%rsi), %r13d +; CHECK-NEXT: testb $-128, %r11b ; CHECK-NEXT: je LBB12_18 ; CHECK-NEXT: LBB12_17: ## %cond.load19 -; CHECK-NEXT: movswl 14(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm3 -; CHECK-NEXT: testl $256, %ecx ## imm = 0x100 +; CHECK-NEXT: movzwl 14(%rsi), %r14d +; CHECK-NEXT: testl $256, %r11d ## imm = 0x100 ; CHECK-NEXT: je LBB12_20 ; CHECK-NEXT: LBB12_19: ## %cond.load22 -; CHECK-NEXT: movswl 16(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm16 -; CHECK-NEXT: testl $512, %ecx ## imm = 0x200 +; CHECK-NEXT: movzwl 16(%rsi), %r8d +; CHECK-NEXT: testl $512, %r11d ## imm = 0x200 ; CHECK-NEXT: je LBB12_22 ; CHECK-NEXT: LBB12_21: ## %cond.load25 -; CHECK-NEXT: movswl 18(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm15 -; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400 +; CHECK-NEXT: movzwl 18(%rsi), %r9d +; CHECK-NEXT: testl $1024, %r11d ## imm = 0x400 ; CHECK-NEXT: je LBB12_24 ; CHECK-NEXT: LBB12_23: ## %cond.load28 -; CHECK-NEXT: movswl 20(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm14 -; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800 +; CHECK-NEXT: movzwl 20(%rsi), %r10d +; CHECK-NEXT: testl $2048, %r11d ## imm = 0x800 ; CHECK-NEXT: je LBB12_26 ; CHECK-NEXT: LBB12_25: ## %cond.load31 -; CHECK-NEXT: movswl 22(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm13 -; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000 -; CHECK-NEXT: je LBB12_28 -; CHECK-NEXT: LBB12_27: ## %cond.load34 -; CHECK-NEXT: movswl 24(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm12 -; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000 -; CHECK-NEXT: je LBB12_30 +; CHECK-NEXT: movzwl 22(%rsi), %r15d +; CHECK-NEXT: testl $4096, %r11d ## imm = 0x1000 +; CHECK-NEXT: jne LBB12_27 +; CHECK-NEXT: jmp LBB12_28 ; CHECK-NEXT: LBB12_29: ## %cond.load37 
-; CHECK-NEXT: movswl 26(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm11 -; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000 +; CHECK-NEXT: movzwl 26(%rsi), %ecx +; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: testl $16384, %r11d ## imm = 0x4000 ; CHECK-NEXT: je LBB12_32 ; CHECK-NEXT: LBB12_31: ## %cond.load40 -; CHECK-NEXT: movswl 28(%rsi), %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm10 -; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000 -; CHECK-NEXT: jne LBB12_33 -; CHECK-NEXT: jmp LBB12_34 +; CHECK-NEXT: movzwl 28(%rsi), %ecx +; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: testl $32768, %r11d ## imm = 0x8000 +; CHECK-NEXT: jne LBB12_34 +; CHECK-NEXT: LBB12_33: +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: LBB12_35: ## %else44 +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx ## 4-byte Reload +; CHECK-NEXT: movw %dx, (%rax) +; CHECK-NEXT: movw %di, 2(%rax) +; CHECK-NEXT: movw %cx, 4(%rax) +; CHECK-NEXT: movw %r12w, 6(%rax) +; CHECK-NEXT: movw %bx, 8(%rax) +; CHECK-NEXT: movw %bp, 10(%rax) +; CHECK-NEXT: movw %r13w, 12(%rax) +; CHECK-NEXT: movw %r14w, 14(%rax) +; CHECK-NEXT: movw %r8w, 16(%rax) +; CHECK-NEXT: movw %r9w, 18(%rax) +; CHECK-NEXT: movw %r10w, 20(%rax) +; CHECK-NEXT: movw %r15w, 22(%rax) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload +; CHECK-NEXT: movw %cx, 24(%rax) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload +; CHECK-NEXT: movw %cx, 26(%rax) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload +; CHECK-NEXT: movw %cx, 28(%rax) +; CHECK-NEXT: movw %si, 30(%rax) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq %res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1>%mask, <16 x half> zeroinitializer) ret <16 x half> %res } @@ -440,107 +414,77 @@ ; CHECK-NEXT: LBB13_32: ## %else30 ; CHECK-NEXT: retq ; CHECK-NEXT: LBB13_1: ## %cond.store -; CHECK-NEXT: vcvtps2ph $4, %xmm1, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: movw %si, (%rdi) ; CHECK-NEXT: testb $2, %al ; CHECK-NEXT: je LBB13_4 ; CHECK-NEXT: LBB13_3: ## %cond.store1 -; CHECK-NEXT: vcvtps2ph $4, %xmm2, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 2(%rdi) +; CHECK-NEXT: movw %dx, 2(%rdi) ; CHECK-NEXT: testb $4, %al ; CHECK-NEXT: je LBB13_6 ; CHECK-NEXT: LBB13_5: ## %cond.store3 -; CHECK-NEXT: vcvtps2ph $4, %xmm3, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx ; CHECK-NEXT: movw %cx, 4(%rdi) ; CHECK-NEXT: testb $8, %al ; CHECK-NEXT: je LBB13_8 ; CHECK-NEXT: LBB13_7: ## %cond.store5 -; CHECK-NEXT: vcvtps2ph $4, %xmm4, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 6(%rdi) +; CHECK-NEXT: movw %r8w, 6(%rdi) ; CHECK-NEXT: testb $16, %al ; CHECK-NEXT: je LBB13_10 ; CHECK-NEXT: LBB13_9: ## %cond.store7 -; CHECK-NEXT: vcvtps2ph $4, %xmm5, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx -; CHECK-NEXT: movw %cx, 8(%rdi) +; CHECK-NEXT: movw %r9w, 8(%rdi) ; CHECK-NEXT: testb $32, %al ; CHECK-NEXT: je LBB13_12 ; CHECK-NEXT: LBB13_11: ## %cond.store9 -; CHECK-NEXT: vcvtps2ph $4, %xmm6, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movw %cx, 10(%rdi) ; 
CHECK-NEXT: testb $64, %al ; CHECK-NEXT: je LBB13_14 ; CHECK-NEXT: LBB13_13: ## %cond.store11 -; CHECK-NEXT: vcvtps2ph $4, %xmm7, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movw %cx, 12(%rdi) ; CHECK-NEXT: testb $-128, %al ; CHECK-NEXT: je LBB13_16 ; CHECK-NEXT: LBB13_15: ## %cond.store13 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movw %cx, 14(%rdi) ; CHECK-NEXT: testl $256, %eax ## imm = 0x100 ; CHECK-NEXT: je LBB13_18 ; CHECK-NEXT: LBB13_17: ## %cond.store15 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movw %cx, 16(%rdi) ; CHECK-NEXT: testl $512, %eax ## imm = 0x200 ; CHECK-NEXT: je LBB13_20 ; CHECK-NEXT: LBB13_19: ## %cond.store17 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movw %cx, 18(%rdi) ; CHECK-NEXT: testl $1024, %eax ## imm = 0x400 ; CHECK-NEXT: je LBB13_22 ; CHECK-NEXT: LBB13_21: ## %cond.store19 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movw %cx, 20(%rdi) ; CHECK-NEXT: testl $2048, %eax ## imm = 0x800 ; CHECK-NEXT: je LBB13_24 ; CHECK-NEXT: LBB13_23: ## %cond.store21 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movw %cx, 22(%rdi) ; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000 ; CHECK-NEXT: je LBB13_26 ; CHECK-NEXT: LBB13_25: ## %cond.store23 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movw %cx, 24(%rdi) ; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000 ; CHECK-NEXT: je LBB13_28 ; CHECK-NEXT: LBB13_27: ## %cond.store25 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movw %cx, 26(%rdi) ; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000 ; CHECK-NEXT: je LBB13_30 ; CHECK-NEXT: LBB13_29: ## %cond.store27 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movw %cx, 28(%rdi) ; CHECK-NEXT: testl $32768, %eax ## imm = 0x8000 ; CHECK-NEXT: je LBB13_32 ; CHECK-NEXT: LBB13_31: ## %cond.store29 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movw %ax, 30(%rdi) ; CHECK-NEXT: retq call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask) diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1432,20 +1432,20 @@ ; KNL: ## %bb.0: ## %entry ; KNL-NEXT: 
movzwl (%rdi), %eax ## encoding: [0x0f,0xb7,0x07] ; KNL-NEXT: movzwl 2(%rdi), %ecx ## encoding: [0x0f,0xb7,0x4f,0x02] +; KNL-NEXT: movswl %cx, %ecx ## encoding: [0x0f,0xbf,0xc9] +; KNL-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] +; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] +; KNL-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; KNL-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1] +; KNL-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2] +; KNL-NEXT: orb %cl, %dl ## encoding: [0x08,0xca] +; KNL-NEXT: kmovw %edx, %k0 ## encoding: [0xc5,0xf8,0x92,0xc2] +; KNL-NEXT: kshiftlw $1, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x01] ; KNL-NEXT: cwtl ## encoding: [0x98] ; KNL-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; KNL-NEXT: movswl %cx, %eax ## encoding: [0x0f,0xbf,0xc1] -; KNL-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] -; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9] -; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] -; KNL-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca] -; KNL-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; KNL-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; KNL-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; KNL-NEXT: kmovw %ecx, %k0 ## encoding: [0xc5,0xf8,0x92,0xc1] -; KNL-NEXT: kshiftlw $1, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x01] -; KNL-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2] +; KNL-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] ; KNL-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] ; KNL-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] ; KNL-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] @@ -1465,20 +1465,20 @@ ; AVX512BW: ## %bb.0: ## %entry ; AVX512BW-NEXT: movzwl (%rdi), %eax ## encoding: [0x0f,0xb7,0x07] ; AVX512BW-NEXT: movzwl 2(%rdi), %ecx ## encoding: [0x0f,0xb7,0x4f,0x02] +; AVX512BW-NEXT: movswl %cx, %ecx ## encoding: [0x0f,0xbf,0xc9] +; AVX512BW-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] +; AVX512BW-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX512BW-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1] +; AVX512BW-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2] +; AVX512BW-NEXT: orb %cl, %dl ## encoding: [0x08,0xca] +; AVX512BW-NEXT: kmovd %edx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc2] +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x01] ; AVX512BW-NEXT: cwtl ## encoding: [0x98] ; AVX512BW-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] ; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; AVX512BW-NEXT: movswl %cx, %eax ## encoding: [0x0f,0xbf,0xc1] -; AVX512BW-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] -; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 ## encoding: [0xc4,0xe2,0x79,0x13,0xc9] -; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] -; AVX512BW-NEXT: vucomiss 
%xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca] -; AVX512BW-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; AVX512BW-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; AVX512BW-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; AVX512BW-NEXT: kmovd %ecx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc1] -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x01] -; AVX512BW-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2] +; AVX512BW-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] ; AVX512BW-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] ; AVX512BW-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] ; AVX512BW-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] @@ -1497,20 +1497,20 @@ ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: movzwl (%rdi), %eax ## encoding: [0x0f,0xb7,0x07] ; SKX-NEXT: movzwl 2(%rdi), %ecx ## encoding: [0x0f,0xb7,0x4f,0x02] +; SKX-NEXT: movswl %cx, %ecx ## encoding: [0x0f,0xbf,0xc9] +; SKX-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] +; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] +; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; SKX-NEXT: setp %cl ## encoding: [0x0f,0x9a,0xc1] +; SKX-NEXT: setne %dl ## encoding: [0x0f,0x95,0xc2] +; SKX-NEXT: orb %cl, %dl ## encoding: [0x08,0xca] +; SKX-NEXT: kmovd %edx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc2] +; SKX-NEXT: kshiftlb $1, %k0, %k0 ## encoding: [0xc4,0xe3,0x79,0x32,0xc0,0x01] ; SKX-NEXT: cwtl ## encoding: [0x98] ; SKX-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] ; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; SKX-NEXT: movswl %cx, %eax ## encoding: [0x0f,0xbf,0xc1] -; SKX-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] -; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc9] -; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2] -; SKX-NEXT: vucomiss %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xca] -; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; SKX-NEXT: kmovd %ecx, %k0 ## encoding: [0xc5,0xfb,0x92,0xc1] -; SKX-NEXT: kshiftlb $1, %k0, %k0 ## encoding: [0xc4,0xe3,0x79,0x32,0xc0,0x01] -; SKX-NEXT: vucomiss %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc2] +; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] ; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] ; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] ; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll --- a/llvm/test/CodeGen/X86/fmf-flags.ll +++ b/llvm/test/CodeGen/X86/fmf-flags.ll @@ -111,25 +111,28 @@ ; X64: # %bb.0: ; X64-NEXT: pushq %rax ; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: callq __gnu_f2h_ieee -; X64-NEXT: movzwl %ax, %edi +; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee ; X64-NEXT: mulss {{.*}}(%rip), %xmm0 +; X64-NEXT: callq __gnu_f2h_ieee +; X64-NEXT: movzwl %ax, %edi ; X64-NEXT: popq %rax ; X64-NEXT: .cfi_def_cfa_offset 8 -; 
X64-NEXT: retq +; X64-NEXT: jmp __gnu_h2f_ieee # TAILCALL ; ; X86-LABEL: div_arcp_by_const: ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __gnu_h2f_ieee +; X86-NEXT: fmuls {{\.LCPI.*}} ; X86-NEXT: fstps (%esp) ; X86-NEXT: calll __gnu_f2h_ieee ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee -; X86-NEXT: fmuls {{\.LCPI.*}} ; X86-NEXT: popl %eax ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -382,66 +382,94 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 { ; CHECK-LIBCALL-LABEL: test_extend32_vec4: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: pushq %rbx -; CHECK-LIBCALL-NEXT: subq $48, %rsp -; CHECK-LIBCALL-NEXT: movq %rdi, %rbx -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: subq $88, %rsp +; CHECK-LIBCALL-NEXT: movl (%rdi), %eax +; CHECK-LIBCALL-NEXT: movl 4(%rdi), %ecx +; CHECK-LIBCALL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; CHECK-LIBCALL-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi +; CHECK-LIBCALL-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi +; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-LIBCALL-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-LIBCALL-NEXT: addq $48, %rsp -; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-LIBCALL-NEXT: addq $88, %rsp ; CHECK-LIBCALL-NEXT: retq ; ; 
BWON-F16C-LABEL: test_extend32_vec4: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movswl 6(%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: movswl 4(%rdi), %eax +; BWON-F16C-NEXT: movl (%rdi), %eax +; BWON-F16C-NEXT: movl 4(%rdi), %ecx +; BWON-F16C-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; BWON-F16C-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; BWON-F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; BWON-F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 +; BWON-F16C-NEXT: vpextrw $1, %xmm1, %eax +; BWON-F16C-NEXT: cwtl +; BWON-F16C-NEXT: vmovd %eax, %xmm2 +; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 +; BWON-F16C-NEXT: vmovd %xmm1, %eax +; BWON-F16C-NEXT: cwtl ; BWON-F16C-NEXT: vmovd %eax, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: movswl (%rdi), %eax +; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: cwtl ; BWON-F16C-NEXT: vmovd %eax, %xmm2 ; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: movswl 2(%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm3 -; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3 -; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; BWON-F16C-NEXT: vpextrw $1, %xmm0, %eax +; BWON-F16C-NEXT: cwtl +; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_extend32_vec4: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $56, %esp -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-I686-NEXT: movzwl 2(%esi), %eax +; CHECK-I686-NEXT: subl $124, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movl (%eax), %ecx +; CHECK-I686-NEXT: movl 4(%eax), %eax +; CHECK-I686-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: pextrw $1, %xmm0, %eax ; CHECK-I686-NEXT: movl %eax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-I686-NEXT: movzwl 4(%esi), %eax +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movl %eax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-I686-NEXT: movzwl 6(%esi), %eax +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $1, %xmm0, %eax ; CHECK-I686-NEXT: movl %eax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: movzwl (%esi), %eax +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movl %eax, (%esp) ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload @@ -457,8 +485,7 @@ ; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-I686-NEXT: 
movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-I686-NEXT: addl $56, %esp -; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: addl $124, %esp ; CHECK-I686-NEXT: retl %a = load <4 x half>, <4 x half>* %p, align 8 %b = fpext <4 x half> %a to <4 x float> @@ -468,92 +495,97 @@ define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 { ; CHECK-LIBCALL-LABEL: test_extend64_vec4: ; CHECK-LIBCALL: # %bb.0: +; CHECK-LIBCALL-NEXT: pushq %rbp +; CHECK-LIBCALL-NEXT: pushq %r14 ; CHECK-LIBCALL-NEXT: pushq %rbx -; CHECK-LIBCALL-NEXT: subq $16, %rsp -; CHECK-LIBCALL-NEXT: movq %rdi, %rbx -; CHECK-LIBCALL-NEXT: movzwl 4(%rdi), %edi +; CHECK-LIBCALL-NEXT: subq $32, %rsp +; CHECK-LIBCALL-NEXT: movzwl 4(%rdi), %r14d +; CHECK-LIBCALL-NEXT: movzwl 6(%rdi), %ebp +; CHECK-LIBCALL-NEXT: movzwl (%rdi), %ebx +; CHECK-LIBCALL-NEXT: movzwl 2(%rdi), %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi +; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movl %ebx, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi +; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 +; CHECK-LIBCALL-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movl %ebp, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi +; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movl %r14d, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee ; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm1 -; CHECK-LIBCALL-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-LIBCALL-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-LIBCALL-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero -; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm2 -; CHECK-LIBCALL-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero -; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm1 -; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; CHECK-LIBCALL-NEXT: addq $16, %rsp +; CHECK-LIBCALL-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: addq $32, %rsp ; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: popq %r14 +; CHECK-LIBCALL-NEXT: popq %rbp ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_extend64_vec4: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movswl (%rdi), %eax +; BWON-F16C-NEXT: movswl 6(%rdi), %eax ; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; BWON-F16C-NEXT: movswl 4(%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm1 +; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; BWON-F16C-NEXT: movswl 
2(%rdi), %eax ; BWON-F16C-NEXT: vmovd %eax, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: movswl 4(%rdi), %eax +; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; BWON-F16C-NEXT: movswl (%rdi), %eax ; BWON-F16C-NEXT: vmovd %eax, %xmm2 ; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: movswl 6(%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm3 -; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3 -; BWON-F16C-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 ; BWON-F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; BWON-F16C-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_extend64_vec4: ; CHECK-I686: # %bb.0: +; CHECK-I686-NEXT: pushl %ebx +; CHECK-I686-NEXT: pushl %edi ; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $88, %esp -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-I686-NEXT: movzwl 6(%esi), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-I686-NEXT: movzwl 4(%esi), %eax +; CHECK-I686-NEXT: subl $64, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movzwl 6(%eax), %esi +; CHECK-I686-NEXT: movzwl (%eax), %edi +; CHECK-I686-NEXT: movzwl 2(%eax), %ebx +; CHECK-I686-NEXT: movzwl 4(%eax), %eax ; CHECK-I686-NEXT: movl %eax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-I686-NEXT: movzwl 2(%esi), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: movl %ebx, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-I686-NEXT: movzwl (%esi), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: movl %edi, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: movl %esi, (%esp) ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-I686-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; CHECK-I686-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-I686-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; CHECK-I686-NEXT: addl $88, %esp +; CHECK-I686-NEXT: addl $64, %esp ; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: popl %edi +; CHECK-I686-NEXT: popl %ebx ; CHECK-I686-NEXT: retl %a = load <4 x half>, <4 x half>* %p, align 8 %b = fpext <4 x half> %a to <4 x double> @@ -843,9 +875,7 @@ ; CHECK-LIBCALL-NEXT: pushq %rax ; CHECK-LIBCALL-NEXT: callq test_floatret ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee -; CHECK-LIBCALL-NEXT: movzwl %ax, %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: popq %rax +; CHECK-LIBCALL-NEXT: popq %rcx ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_f80trunc_nodagcombine: @@ -853,8 +883,9 @@ ; BWON-F16C-NEXT: pushq 
%rax ; BWON-F16C-NEXT: callq test_floatret ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: popq %rax +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: # kill: def $ax killed $ax killed $eax +; BWON-F16C-NEXT: popq %rcx ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_f80trunc_nodagcombine: @@ -863,9 +894,6 @@ ; CHECK-I686-NEXT: calll test_floatret ; CHECK-I686-NEXT: fstps (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movzwl %ax, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl %1 = call float @test_floatret() @@ -881,54 +909,62 @@ ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rbx ; CHECK-LIBCALL-NEXT: subq $16, %rsp -; CHECK-LIBCALL-NEXT: movl %edi, %ebx -; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-LIBCALL-NEXT: cvtsi2ss %ebx, %xmm0 +; CHECK-LIBCALL-NEXT: movzwl (%rsi), %ebx +; CHECK-LIBCALL-NEXT: cvtsi2ss %edi, %xmm0 ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee ; CHECK-LIBCALL-NEXT: movzwl %ax, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-LIBCALL-NEXT: movl %ebx, %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee ; CHECK-LIBCALL-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee +; CHECK-LIBCALL-NEXT: movzwl %ax, %edi ; CHECK-LIBCALL-NEXT: addq $16, %rsp ; CHECK-LIBCALL-NEXT: popq %rbx -; CHECK-LIBCALL-NEXT: retq +; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee # TAILCALL ; ; BWON-F16C-LABEL: test_sitofp_fadd_i32: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movswl (%rsi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm1, %xmm1 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; BWON-F16C-NEXT: movswl (%rsi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; BWON-F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_sitofp_fadd_i32: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: subl $28, %esp +; CHECK-I686-NEXT: pushl %edi +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $20, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %eax +; CHECK-I686-NEXT: movzwl (%eax), %edi +; CHECK-I686-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: movw %ax, %si +; CHECK-I686-NEXT: movl %edi, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: movzwl %si, %eax ; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-I686-NEXT: xorps %xmm0, %xmm0 -; CHECK-I686-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: addss {{[0-9]+}}(%esp), %xmm0 ; CHECK-I686-NEXT: movss %xmm0, (%esp) ; CHECK-I686-NEXT: calll 
__gnu_f2h_ieee ; CHECK-I686-NEXT: movzwl %ax, %eax ; CHECK-I686-NEXT: movl %eax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-I686-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: addss {{[0-9]+}}(%esp), %xmm0 -; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: addl $28, %esp +; CHECK-I686-NEXT: addl $20, %esp +; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: popl %edi ; CHECK-I686-NEXT: retl %tmp0 = load half, half* %b %tmp1 = sitofp i32 %a to half @@ -941,58 +977,47 @@ ; CHECK-LIBCALL-LABEL: PR40273: ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee -; CHECK-LIBCALL-NEXT: movzwl %ax, %edi +; CHECK-LIBCALL-NEXT: movzwl %di, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: xorl %eax, %eax ; CHECK-LIBCALL-NEXT: xorps %xmm1, %xmm1 ; CHECK-LIBCALL-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-LIBCALL-NEXT: jne .LBB17_3 -; CHECK-LIBCALL-NEXT: # %bb.1: -; CHECK-LIBCALL-NEXT: jp .LBB17_3 -; CHECK-LIBCALL-NEXT: # %bb.2: -; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: .LBB17_3: -; CHECK-LIBCALL-NEXT: popq %rax +; CHECK-LIBCALL-NEXT: movl $15360, %ecx # imm = 0x3C00 +; CHECK-LIBCALL-NEXT: cmovnel %ecx, %eax +; CHECK-LIBCALL-NEXT: cmovpl %ecx, %eax +; CHECK-LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-LIBCALL-NEXT: popq %rcx ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: PR40273: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: movswl %di, %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: xorl %eax, %eax ; BWON-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0 -; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; BWON-F16C-NEXT: jne .LBB17_3 -; BWON-F16C-NEXT: # %bb.1: -; BWON-F16C-NEXT: jp .LBB17_3 -; BWON-F16C-NEXT: # %bb.2: -; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: .LBB17_3: +; BWON-F16C-NEXT: movl $15360, %ecx # imm = 0x3C00 +; BWON-F16C-NEXT: cmovnel %ecx, %eax +; BWON-F16C-NEXT: cmovpl %ecx, %eax +; BWON-F16C-NEXT: # kill: def $ax killed $ax killed $eax ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: PR40273: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: subl $12, %esp -; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss %xmm0, (%esp) -; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movzwl %ax, %eax +; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-I686-NEXT: movl %eax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: xorl %eax, %eax ; CHECK-I686-NEXT: xorps %xmm1, %xmm1 ; CHECK-I686-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: jne .LBB17_3 -; CHECK-I686-NEXT: # %bb.1: -; CHECK-I686-NEXT: jp .LBB17_3 -; CHECK-I686-NEXT: # %bb.2: -; CHECK-I686-NEXT: xorps %xmm0, %xmm0 -; CHECK-I686-NEXT: .LBB17_3: -; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movl $15360, %ecx # imm = 0x3C00 +; CHECK-I686-NEXT: cmovnel %ecx, %eax +; CHECK-I686-NEXT: cmovpl %ecx, %eax +; CHECK-I686-NEXT: # kill: def $ax 
killed $ax killed $eax ; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl %2 = fcmp une half %0, 0xH0000 diff --git a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll --- a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll +++ b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll @@ -17,7 +17,6 @@ define half @mxcsr_f16c(float %a) { ; CHECK: VCVTPS2PH{{.*}}mxcsr -; CHECK: VCVTPH2PS{{.*}}mxcsr %res = fptrunc float %a to half ret half %res } diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll --- a/llvm/test/CodeGen/X86/pr31088.ll +++ b/llvm/test/CodeGen/X86/pr31088.ll @@ -6,55 +6,52 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind { ; X86-LABEL: ir_fadd_v1f16: ; X86: # %bb.0: -; X86-NEXT: subl $28, %esp -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll __gnu_f2h_ieee -; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: pushl %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee -; X86-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll __gnu_f2h_ieee -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload +; X86-NEXT: movl %esi, (%esp) ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: calll __gnu_h2f_ieee ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0 -; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: addl $28, %esp +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: calll __gnu_f2h_ieee +; X86-NEXT: addl $12, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: ir_fadd_v1f16: ; X64: # %bb.0: -; X64-NEXT: pushq %rax -; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movaps %xmm1, %xmm0 -; X64-NEXT: callq __gnu_f2h_ieee -; X64-NEXT: movzwl %ax, %edi +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $16, %rsp +; X64-NEXT: movl %edi, %ebx +; X64-NEXT: movzwl %si, %edi ; X64-NEXT: callq __gnu_h2f_ieee -; X64-NEXT: movss %xmm0, (%rsp) # 4-byte Spill -; X64-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero -; X64-NEXT: callq __gnu_f2h_ieee -; X64-NEXT: movzwl %ax, %edi +; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movzwl %bx, %edi ; X64-NEXT: callq __gnu_h2f_ieee -; X64-NEXT: addss (%rsp), %xmm0 # 4-byte Folded Reload -; X64-NEXT: popq %rax +; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; X64-NEXT: callq __gnu_f2h_ieee +; X64-NEXT: addq $16, %rsp +; X64-NEXT: popq %rbx ; X64-NEXT: retq ; ; F16C-LABEL: ir_fadd_v1f16: ; F16C: # %bb.0: -; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; F16C-NEXT: movswl %si, %eax +; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: movswl %di, %eax +; F16C-NEXT: vmovd %eax, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: # kill: def $ax killed $ax killed $eax ; F16C-NEXT: retq %retval = fadd <1 x half> %arg0, %arg1 ret <1 x half> %retval @@ -63,99 +60,118 @@ define <2 x half> 
@ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind { ; X86-LABEL: ir_fadd_v2f16: ; X86: # %bb.0: -; X86-NEXT: subl $64, %esp -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll __gnu_f2h_ieee -; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movzwl 8(%ebp), %esi +; X86-NEXT: movzwl 12(%ebp), %edi +; X86-NEXT: movzwl 20(%ebp), %ebx +; X86-NEXT: movzwl 16(%ebp), %eax ; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee -; X86-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll __gnu_f2h_ieee -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: movl %ebx, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee -; X86-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll __gnu_f2h_ieee -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; X86-NEXT: movl %edi, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee -; X86-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: calll __gnu_f2h_ieee -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload +; X86-NEXT: movl %esi, (%esp) ; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: calll __gnu_h2f_ieee +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: calll __gnu_f2h_ieee ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0 -; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movss %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: flds {{[0-9]+}}(%esp) -; X86-NEXT: addl $64, %esp +; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: calll __gnu_f2h_ieee +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: pextrw $1, %xmm0, %edx +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: # kill: def $dx killed $dx killed $edx +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: ir_fadd_v2f16: ; X64: # %bb.0: -; X64-NEXT: subq $24, %rsp -; X64-NEXT: movss %xmm2, {{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movaps %xmm3, %xmm0 -; 
X64-NEXT: callq __gnu_f2h_ieee -; X64-NEXT: movzwl %ax, %edi +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $32, %rsp +; X64-NEXT: movl %edx, %ebx +; X64-NEXT: movl %esi, %ebp +; X64-NEXT: movl %edi, %r14d +; X64-NEXT: movzwl %cx, %edi ; X64-NEXT: callq __gnu_h2f_ieee -; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero -; X64-NEXT: callq __gnu_f2h_ieee -; X64-NEXT: movzwl %ax, %edi +; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movzwl %bp, %edi ; X64-NEXT: callq __gnu_h2f_ieee -; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __gnu_f2h_ieee -; X64-NEXT: movzwl %ax, %edi +; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; X64-NEXT: movzwl %bx, %edi ; X64-NEXT: callq __gnu_h2f_ieee -; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill -; X64-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero -; X64-NEXT: callq __gnu_f2h_ieee -; X64-NEXT: movzwl %ax, %edi +; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: movzwl %r14w, %edi ; X64-NEXT: callq __gnu_h2f_ieee -; X64-NEXT: addss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload -; X64-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload -; X64-NEXT: # xmm1 = mem[0],zero,zero,zero -; X64-NEXT: addss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload -; X64-NEXT: addq $24, %rsp +; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; X64-NEXT: callq __gnu_f2h_ieee +; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; X64-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: pextrw $1, %xmm0, %edx +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $dx killed $dx killed $edx +; X64-NEXT: addq $32, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: popq %rbp ; X64-NEXT: retq ; ; F16C-LABEL: ir_fadd_v2f16: ; F16C: # %bb.0: -; F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; F16C-NEXT: vcvtph2ps %xmm3, %xmm3 -; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; F16C-NEXT: movswl %cx, %eax +; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: movswl %si, %eax +; F16C-NEXT: vmovd %eax, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 +; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; F16C-NEXT: movswl %dx, %eax +; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 +; F16C-NEXT: movswl %di, %eax +; F16C-NEXT: vmovd %eax, %xmm1 +; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: vpextrw $1, %xmm0, %edx +; F16C-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-NEXT: # kill: def $dx killed $dx killed $edx ; F16C-NEXT: retq %retval = fadd <2 x half> %arg0, %arg1 ret <2 x half> %retval diff --git a/llvm/test/CodeGen/X86/pr38533.ll 
b/llvm/test/CodeGen/X86/pr38533.ll --- a/llvm/test/CodeGen/X86/pr38533.ll +++ b/llvm/test/CodeGen/X86/pr38533.ll @@ -14,22 +14,10 @@ ; Similarly this makes sure that the opposite bitcast of the above is also legalized without crashing. define void @pr38533_2(half %x) { -; SSE-LABEL: pr38533_2: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: .cfi_def_cfa_offset 16 -; SSE-NEXT: callq __gnu_f2h_ieee -; SSE-NEXT: movw %ax, (%rax) -; SSE-NEXT: popq %rax -; SSE-NEXT: .cfi_def_cfa_offset 8 -; SSE-NEXT: retq -; -; AVX512-LABEL: pr38533_2: -; AVX512: # %bb.0: -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: movw %ax, (%rax) -; AVX512-NEXT: retq +; CHECK-LABEL: pr38533_2: +; CHECK: # %bb.0: +; CHECK-NEXT: movw %di, (%rax) +; CHECK-NEXT: retq %a = bitcast half %x to <4 x i4> store volatile <4 x i4> %a, <4 x i4>* undef ret void @@ -37,22 +25,10 @@ ; This case is a bitcast from fp16 to a 16-bit wide legal vector type. In this case the result type is legal when the bitcast gets type legalized. define void @pr38533_3(half %x) { -; SSE-LABEL: pr38533_3: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: .cfi_def_cfa_offset 16 -; SSE-NEXT: callq __gnu_f2h_ieee -; SSE-NEXT: movw %ax, (%rax) -; SSE-NEXT: popq %rax -; SSE-NEXT: .cfi_def_cfa_offset 8 -; SSE-NEXT: retq -; -; AVX512-LABEL: pr38533_3: -; AVX512: # %bb.0: -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: movw %ax, (%rax) -; AVX512-NEXT: retq +; CHECK-LABEL: pr38533_3: +; CHECK: # %bb.0: +; CHECK-NEXT: movw %di, (%rax) +; CHECK-NEXT: retq %a = bitcast half %x to <16 x i1> store volatile <16 x i1> %a, <16 x i1>* undef ret void diff --git a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll --- a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll +++ b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll @@ -4,22 +4,28 @@ define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) { ; CHECK-LABEL: f: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %r8d -; CHECK-NEXT: movzwl 2(%rdi), %r9d +; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: movzwl 2(%rdi), %ecx +; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzwl 6(%rdi), %r8d ; CHECK-NEXT: movzwl 4(%rdi), %r11d -; CHECK-NEXT: movzwl 6(%rdi), %edi -; CHECK-NEXT: movzwl (%rsi), %r10d -; CHECK-NEXT: movzwl 2(%rsi), %ecx -; CHECK-NEXT: movzwl 4(%rsi), %eax -; CHECK-NEXT: movzwl 6(%rsi), %esi -; CHECK-NEXT: movw %si, 14(%rdx) -; CHECK-NEXT: movw %di, 12(%rdx) -; CHECK-NEXT: movw %ax, 10(%rdx) +; CHECK-NEXT: movq (%rsi), %rsi +; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: pextrw $1, %xmm0, %r9d +; CHECK-NEXT: movd %xmm0, %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: pextrw $3, %xmm0, %eax +; CHECK-NEXT: pextrw $2, %xmm0, %edi ; CHECK-NEXT: movw %r11w, 8(%rdx) -; CHECK-NEXT: movw %cx, 6(%rdx) -; CHECK-NEXT: movw %r9w, 4(%rdx) +; CHECK-NEXT: movw %cx, 4(%rdx) +; CHECK-NEXT: movw %r8w, 12(%rdx) +; CHECK-NEXT: movw %si, (%rdx) +; CHECK-NEXT: movw %di, 10(%rdx) +; CHECK-NEXT: movw %ax, 14(%rdx) ; CHECK-NEXT: movw %r10w, 2(%rdx) -; CHECK-NEXT: movw %r8w, (%rdx) +; CHECK-NEXT: movw %r9w, 6(%rdx) ; CHECK-NEXT: retq %tmp4 = load <4 x half>, <4 x half>* %a %tmp5 = load <4 x half>, <4 x half>* %b diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ 
b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -2153,58 +2153,56 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { ; SSE-LABEL: fptosi_2f16_to_4i32: ; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %rbx ; SSE-NEXT: pushq %rax -; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: callq __gnu_f2h_ieee -; SSE-NEXT: movzwl %ax, %edi +; SSE-NEXT: movl %esi, %ebx +; SSE-NEXT: movzwl %di, %edi ; SSE-NEXT: callq __gnu_h2f_ieee -; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill -; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: callq __gnu_f2h_ieee -; SSE-NEXT: movzwl %ax, %edi +; SSE-NEXT: cvttss2si %xmm0, %ebp +; SSE-NEXT: movzwl %bx, %edi ; SSE-NEXT: callq __gnu_h2f_ieee ; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload -; SSE-NEXT: movd %ecx, %xmm0 -; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ebp, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; SSE-NEXT: popq %rax +; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f16_to_4i32: ; VEX: # %bb.0: +; VEX-NEXT: pushq %rbp +; VEX-NEXT: pushq %rbx ; VEX-NEXT: pushq %rax -; VEX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; VEX-NEXT: vmovaps %xmm1, %xmm0 -; VEX-NEXT: callq __gnu_f2h_ieee -; VEX-NEXT: movzwl %ax, %edi +; VEX-NEXT: movl %esi, %ebx +; VEX-NEXT: movzwl %di, %edi ; VEX-NEXT: callq __gnu_h2f_ieee -; VEX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill -; VEX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; VEX-NEXT: # xmm0 = mem[0],zero,zero,zero -; VEX-NEXT: callq __gnu_f2h_ieee -; VEX-NEXT: movzwl %ax, %edi +; VEX-NEXT: vcvttss2si %xmm0, %ebp +; VEX-NEXT: movzwl %bx, %edi ; VEX-NEXT: callq __gnu_h2f_ieee ; VEX-NEXT: vcvttss2si %xmm0, %eax -; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload -; VEX-NEXT: vmovd %ecx, %xmm0 -; VEX-NEXT: vmovd %eax, %xmm1 +; VEX-NEXT: vmovd %eax, %xmm0 +; VEX-NEXT: vmovd %ebp, %xmm1 ; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; VEX-NEXT: popq %rax +; VEX-NEXT: addq $8, %rsp +; VEX-NEXT: popq %rbx +; VEX-NEXT: popq %rbp ; VEX-NEXT: retq ; ; AVX512-LABEL: fptosi_2f16_to_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: movswl %di, %eax +; AVX512-NEXT: vmovd %eax, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvttss2si %xmm0, %eax -; AVX512-NEXT: vcvttss2si %xmm1, %ecx +; AVX512-NEXT: movswl %si, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvttss2si %xmm0, %ecx ; AVX512-NEXT: vmovd %ecx, %xmm0 ; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -26,26 +26,25 @@ ; ALL: # %bb.0: ; ALL-NEXT: vmovq %xmm0, %rax ; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movswl %ax, %esi -; ALL-NEXT: # kill: def $eax killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax ; ALL-NEXT: shrq $32, 
%rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: movswl %dx, %edx +; ALL-NEXT: movswl %ax, %edx ; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: movl %eax, %edx +; ALL-NEXT: shrl $16, %edx +; ALL-NEXT: movswl %dx, %edx +; ALL-NEXT: vmovd %edx, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; ALL-NEXT: movswl %cx, %ecx ; ALL-NEXT: vmovd %ecx, %xmm1 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; ALL-NEXT: shrq $48, %rax ; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vmovd %esi, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; ALL-NEXT: retq %1 = bitcast <4 x i16> %a0 to <4 x half> %2 = fpext <4 x half> %1 to <4 x float> @@ -57,26 +56,25 @@ ; ALL: # %bb.0: ; ALL-NEXT: vmovq %xmm0, %rax ; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movswl %ax, %esi -; ALL-NEXT: # kill: def $eax killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax ; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: movswl %dx, %edx +; ALL-NEXT: movswl %ax, %edx ; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: movl %eax, %edx +; ALL-NEXT: shrl $16, %edx +; ALL-NEXT: movswl %dx, %edx +; ALL-NEXT: vmovd %edx, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; ALL-NEXT: movswl %cx, %ecx ; ALL-NEXT: vmovd %ecx, %xmm1 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; ALL-NEXT: shrq $48, %rax ; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vmovd %esi, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; ALL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = bitcast <4 x i16> %1 to <4 x half> @@ -87,51 +85,49 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind { ; ALL-LABEL: cvt_8i16_to_8f32: ; ALL: # %bb.0: +; ALL-NEXT: vmovq %xmm0, %rax +; ALL-NEXT: movq %rax, %rcx +; ALL-NEXT: shrq $32, %rcx ; ALL-NEXT: vpextrq $1, %xmm0, %rdx -; ALL-NEXT: movq %rdx, %r8 -; ALL-NEXT: movq %rdx, %r10 -; ALL-NEXT: movswl %dx, %r9d -; ALL-NEXT: # kill: def $edx killed $edx killed $rdx -; ALL-NEXT: shrl $16, %edx -; ALL-NEXT: shrq $32, %r8 -; ALL-NEXT: shrq $48, %r10 -; ALL-NEXT: vmovq %xmm0, %rdi -; ALL-NEXT: movq %rdi, %rax -; ALL-NEXT: movq %rdi, %rsi -; ALL-NEXT: movswl %di, %ecx -; ALL-NEXT: # kill: def $edi killed $edi killed $rdi +; ALL-NEXT: movq %rdx, %rsi +; ALL-NEXT: shrq $32, %rsi +; ALL-NEXT: movswl %dx, %edi +; ALL-NEXT: vmovd %edi, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: movl %edx, %edi ; ALL-NEXT: shrl $16, %edi -; ALL-NEXT: shrq $32, %rax -; ALL-NEXT: shrq $48, %rsi +; ALL-NEXT: movswl %di, %edi +; ALL-NEXT: vmovd %edi, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; 
ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; ALL-NEXT: movswl %si, %esi -; ALL-NEXT: vmovd %esi, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vmovd %esi, %xmm1 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: movswl %di, %eax +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; ALL-NEXT: shrq $48, %rdx +; ALL-NEXT: movswl %dx, %edx +; ALL-NEXT: vmovd %edx, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; ALL-NEXT: movswl %ax, %edx +; ALL-NEXT: vmovd %edx, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: movl %eax, %edx +; ALL-NEXT: shrl $16, %edx +; ALL-NEXT: movswl %dx, %edx +; ALL-NEXT: vmovd %edx, %xmm2 +; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 +; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; ALL-NEXT: movswl %cx, %ecx +; ALL-NEXT: vmovd %ecx, %xmm2 +; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 +; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; ALL-NEXT: shrq $48, %rax +; ALL-NEXT: cwtl ; ALL-NEXT: vmovd %eax, %xmm2 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vmovd %ecx, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: movswl %r10w, %eax -; ALL-NEXT: vmovd %eax, %xmm4 -; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 -; ALL-NEXT: movswl %r8w, %eax -; ALL-NEXT: vmovd %eax, %xmm5 -; ALL-NEXT: vcvtph2ps %xmm5, %xmm5 -; ALL-NEXT: movswl %dx, %eax -; ALL-NEXT: vmovd %eax, %xmm6 -; ALL-NEXT: vcvtph2ps %xmm6, %xmm6 -; ALL-NEXT: vmovd %r9d, %xmm7 -; ALL-NEXT: vcvtph2ps %xmm7, %xmm7 -; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; ALL-NEXT: retq %1 = bitcast <8 x i16> %a0 to <8 x half> %2 = fpext <8 x half> %1 to <8 x float> @@ -141,385 +137,277 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { ; AVX1-LABEL: cvt_16i16_to_16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm8 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %r10 +; AVX1-NEXT: movq %r10, %r8 +; AVX1-NEXT: shrq $32, %r8 +; AVX1-NEXT: vpextrq $1, %xmm1, %rdx +; AVX1-NEXT: movq %rdx, %r9 +; AVX1-NEXT: shrq $32, %r9 +; AVX1-NEXT: vmovq %xmm0, %rdi +; AVX1-NEXT: movq %rdi, %r11 +; AVX1-NEXT: shrq $32, %r11 +; AVX1-NEXT: vpextrq $1, %xmm0, %rsi +; AVX1-NEXT: movq %rsi, %rax +; AVX1-NEXT: shrq $32, %rax +; AVX1-NEXT: movswl %si, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX1-NEXT: movl %esi, %ecx +; AVX1-NEXT: shrl $16, %ecx ; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm9 -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: # kill: def $eax killed $eax killed $rax -; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm10 -; AVX1-NEXT: 
vpextrq $1, %xmm4, %rax -; AVX1-NEXT: vmovd %ecx, %xmm11 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm12 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm13 -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX1-NEXT: shrq $48, %rsi +; AVX1-NEXT: movswl %si, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX1-NEXT: movswl %di, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm14 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vmovd %ecx, %xmm15 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm3 -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; AVX1-NEXT: movswl %r11w, %eax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: shrq $48, %rdi +; AVX1-NEXT: movswl %di, %eax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: movswl %dx, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: movl %edx, %eax ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm4 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm5 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm6 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm7 -; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8 -; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9 -; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10 -; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11 -; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12 -; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13 -; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15 +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; AVX1-NEXT: movswl %r9w, %eax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: shrq $48, %rdx +; AVX1-NEXT: movswl %dx, %eax +; AVX1-NEXT: vmovd %eax, %xmm2 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX1-NEXT: movswl %r10w, %eax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: movl %r10d, %eax +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd 
%eax, %xmm3 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; AVX1-NEXT: movswl %r8w, %eax +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX1-NEXT: shrq $48, %r10 +; AVX1-NEXT: movswl %r10w, %eax +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: cvt_16i16_to_16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm8 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %r10 +; AVX2-NEXT: movq %r10, %r8 +; AVX2-NEXT: shrq $32, %r8 +; AVX2-NEXT: vpextrq $1, %xmm1, %rdx +; AVX2-NEXT: movq %rdx, %r9 +; AVX2-NEXT: shrq $32, %r9 +; AVX2-NEXT: vmovq %xmm0, %rdi +; AVX2-NEXT: movq %rdi, %r11 +; AVX2-NEXT: shrq $32, %r11 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: movq %rsi, %rax +; AVX2-NEXT: shrq $32, %rax +; AVX2-NEXT: movswl %si, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: shrl $16, %ecx ; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm9 -; AVX2-NEXT: movswl %ax, %ecx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax -; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm10 -; AVX2-NEXT: vpextrq $1, %xmm4, %rax -; AVX2-NEXT: vmovd %ecx, %xmm11 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm12 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm13 -; AVX2-NEXT: movswl %ax, %ecx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX2-NEXT: shrq $48, %rsi +; AVX2-NEXT: movswl %si, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm1[0] +; AVX2-NEXT: movswl %di, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm14 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vmovd %ecx, %xmm15 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm3 -; AVX2-NEXT: movswl %ax, %ecx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; AVX2-NEXT: movswl %r11w, %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-NEXT: shrq $48, %rdi +; AVX2-NEXT: movswl %di, %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movswl %dx, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm4 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm5 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm6 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: movswl %cx, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm7 -; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8 -; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9 -; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10 -; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11 -; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12 -; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13 -; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15 +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; AVX2-NEXT: movswl %r9w, %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-NEXT: shrq $48, %rdx +; AVX2-NEXT: movswl %dx, %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX2-NEXT: movswl %r10w, %eax +; AVX2-NEXT: vmovd %eax, %xmm2 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX2-NEXT: movl %r10d, %eax +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm3 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; AVX2-NEXT: movswl %r8w, %eax +; AVX2-NEXT: vmovd %eax, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX2-NEXT: shrq $48, %r10 +; AVX2-NEXT: movswl %r10w, %eax +; AVX2-NEXT: vmovd %eax, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: cvt_16i16_to_16f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm8 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $32, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm9 -; AVX512F-NEXT: movswl %ax, %ecx -; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512F-NEXT: shrl $16, %eax -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm11 -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vmovd %ecx, %xmm12 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm13 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $32, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm14 -; AVX512F-NEXT: movswl %ax, %ecx -; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512F-NEXT: shrl $16, %eax -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm15 -; AVX512F-NEXT: vmovq %xmm10, %rax -; AVX512F-NEXT: vmovd %ecx, %xmm2 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm3 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $32, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm1 -; AVX512F-NEXT: movswl %ax, %ecx -; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512F-NEXT: shrl $16, %eax -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrq $1, %xmm10, %rax -; AVX512F-NEXT: vmovd %ecx, %xmm10 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm5 -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: shrq $32, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm6 -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm7 -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8 -; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9 -; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11 -; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12 -; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13 -; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512F-NEXT: vcvtph2ps 
%xmm10, %xmm10 -; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_16i16_to_16f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $48, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm8 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm9 -; AVX512VL-NEXT: movswl %ax, %ecx -; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512VL-NEXT: shrl $16, %eax -; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm11 -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vmovd %ecx, %xmm12 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $48, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm13 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm14 -; AVX512VL-NEXT: movswl %ax, %ecx -; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512VL-NEXT: shrl $16, %eax -; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm15 -; AVX512VL-NEXT: vmovq %xmm10, %rax -; AVX512VL-NEXT: vmovd %ecx, %xmm16 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $48, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm17 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm18 -; AVX512VL-NEXT: movswl %ax, %ecx -; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512VL-NEXT: shrl $16, %eax -; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm19 -; AVX512VL-NEXT: vpextrq $1, %xmm10, %rax -; AVX512VL-NEXT: vmovd %ecx, %xmm10 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $48, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm20 -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm21 -; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $16, %ecx -; AVX512VL-NEXT: movswl %cx, %ecx -; AVX512VL-NEXT: vmovd %ecx, %xmm22 -; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm8, %xmm8 -; AVX512VL-NEXT: vcvtph2ps %xmm9, %xmm9 -; AVX512VL-NEXT: vcvtph2ps %xmm11, %xmm11 -; 
AVX512VL-NEXT: vcvtph2ps %xmm12, %xmm12 -; AVX512VL-NEXT: vcvtph2ps %xmm13, %xmm13 -; AVX512VL-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX512VL-NEXT: vcvtph2ps %xmm15, %xmm15 -; AVX512VL-NEXT: vcvtph2ps %xmm16, %xmm16 -; AVX512VL-NEXT: vcvtph2ps %xmm17, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm18, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm19, %xmm5 -; AVX512VL-NEXT: vcvtph2ps %xmm10, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm20, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm21, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm22, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: cvt_16i16_to_16f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq %xmm0, %r10 +; AVX512-NEXT: movq %r10, %r8 +; AVX512-NEXT: shrq $32, %r8 +; AVX512-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512-NEXT: movq %rdx, %r9 +; AVX512-NEXT: shrq $32, %r9 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rdi +; AVX512-NEXT: movq %rdi, %r11 +; AVX512-NEXT: shrq $32, %r11 +; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: movq %rsi, %rax +; AVX512-NEXT: shrq $32, %rax +; AVX512-NEXT: movswl %si, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: shrl $16, %ecx +; AVX512-NEXT: movswl %cx, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512-NEXT: cwtl +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX512-NEXT: shrq $48, %rsi +; AVX512-NEXT: movswl %si, %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512-NEXT: movswl %di, %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: shrl $16, %eax +; AVX512-NEXT: cwtl +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; AVX512-NEXT: movswl %r11w, %eax +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: shrq $48, %rdi +; AVX512-NEXT: movswl %di, %eax +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: movswl %dx, %eax +; 
AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: shrl $16, %eax +; AVX512-NEXT: cwtl +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; AVX512-NEXT: movswl %r9w, %eax +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512-NEXT: shrq $48, %rdx +; AVX512-NEXT: movswl %dx, %eax +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512-NEXT: movswl %r10w, %eax +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: shrl $16, %eax +; AVX512-NEXT: cwtl +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; AVX512-NEXT: movswl %r8w, %eax +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX512-NEXT: shrq $48, %r10 +; AVX512-NEXT: movswl %r10w, %eax +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: retq %1 = bitcast <16 x i16> %a0 to <16 x half> %2 = fpext <16 x half> %1 to <16 x float> ret <16 x float> %2 @@ -545,20 +433,30 @@ define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_4i16_to_4f32: ; ALL: # %bb.0: -; ALL-NEXT: movswl 6(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movswl 4(%rdi), %eax +; ALL-NEXT: movl (%rdi), %eax +; ALL-NEXT: movl 4(%rdi), %ecx +; ALL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 +; ALL-NEXT: vpextrw $1, %xmm1, %eax +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm2 +; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: cwtl ; ALL-NEXT: vmovd %eax, %xmm1 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: movswl (%rdi), %eax +; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: cwtl ; ALL-NEXT: vmovd %eax, %xmm2 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: movswl 2(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; ALL-NEXT: vpextrw $1, %xmm0, %eax +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; ALL-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a0 @@ -572,26 +470,25 @@ ; ALL: # %bb.0: ; ALL-NEXT: movq (%rdi), %rax ; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movswl %ax, %esi -; ALL-NEXT: # kill: def $eax killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax ; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: movswl %dx, %edx +; ALL-NEXT: movswl %ax, %edx ; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: movl %eax, %edx +; 
ALL-NEXT: shrl $16, %edx +; ALL-NEXT: movswl %dx, %edx +; ALL-NEXT: vmovd %edx, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; ALL-NEXT: movswl %cx, %ecx ; ALL-NEXT: vmovd %ecx, %xmm1 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; ALL-NEXT: shrq $48, %rax ; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vmovd %esi, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; ALL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> @@ -603,37 +500,57 @@ define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_8i16_to_8f32: ; ALL: # %bb.0: -; ALL-NEXT: movswl 6(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movswl 4(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: movswl (%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: movswl 2(%rdi), %eax +; ALL-NEXT: movl (%rdi), %eax +; ALL-NEXT: movl 4(%rdi), %ecx +; ALL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: movl 12(%rdi), %eax +; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: movl 8(%rdi), %eax +; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 +; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 +; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 +; ALL-NEXT: vpextrw $1, %xmm3, %eax +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm4 +; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 +; ALL-NEXT: vmovd %xmm3, %eax +; ALL-NEXT: cwtl ; ALL-NEXT: vmovd %eax, %xmm3 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: movswl 14(%rdi), %eax +; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: cwtl ; ALL-NEXT: vmovd %eax, %xmm4 ; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 -; ALL-NEXT: movswl 12(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm5 -; ALL-NEXT: vcvtph2ps %xmm5, %xmm5 -; ALL-NEXT: movswl 8(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm6 -; ALL-NEXT: vcvtph2ps %xmm6, %xmm6 -; ALL-NEXT: movswl 10(%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm7 -; ALL-NEXT: vcvtph2ps %xmm7, %xmm7 -; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; ALL-NEXT: vpextrw $1, %xmm2, %eax +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm2 +; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 +; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] +; ALL-NEXT: vpextrw $1, %xmm1, %eax +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm3 +; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2,3] +; ALL-NEXT: vmovd 
%xmm0, %eax +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm3 +; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 +; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; ALL-NEXT: vpextrw $1, %xmm0, %eax +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; ALL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = bitcast <8 x i16> %1 to <8 x half> @@ -644,268 +561,436 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind { ; AVX1-LABEL: load_cvt_16i16_to_16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: movswl 22(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8 -; AVX1-NEXT: movswl 20(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9 -; AVX1-NEXT: movswl 16(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10 -; AVX1-NEXT: movswl 18(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11 -; AVX1-NEXT: movswl 30(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12 -; AVX1-NEXT: movswl 28(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13 -; AVX1-NEXT: movswl 24(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14 -; AVX1-NEXT: movswl 26(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15 -; AVX1-NEXT: movswl 6(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: movswl 4(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: movswl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: movswl 2(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm4 -; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX1-NEXT: movswl 14(%rdi), %eax +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: movl 20(%rdi), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl 16(%rdi), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl 28(%rdi), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl 24(%rdi), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: movl 4(%rdi), %ecx +; AVX1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl 12(%rdi), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl 8(%rdi), %eax +; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 +; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 +; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 +; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 +; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 +; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 +; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 +; AVX1-NEXT: vpextrw $1, %xmm7, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: vmovd %xmm7, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm7 +; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] +; AVX1-NEXT: vmovd %xmm6, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm7 +; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3] +; AVX1-NEXT: vpextrw $1, %xmm6, %eax +; AVX1-NEXT: cwtl +; 
AVX1-NEXT: vmovd %eax, %xmm6 +; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[0] +; AVX1-NEXT: vpextrw $1, %xmm5, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm6 +; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX1-NEXT: vmovd %xmm5, %eax +; AVX1-NEXT: cwtl ; AVX1-NEXT: vmovd %eax, %xmm5 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX1-NEXT: movswl 12(%rdi), %eax +; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cwtl ; AVX1-NEXT: vmovd %eax, %xmm6 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: movswl 8(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm7 -; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX1-NEXT: movswl 10(%rdi), %eax +; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] +; AVX1-NEXT: vpextrw $1, %xmm0, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpextrw $1, %xmm4, %eax +; AVX1-NEXT: cwtl ; AVX1-NEXT: vmovd %eax, %xmm1 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] +; AVX1-NEXT: vmovd %xmm4, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm4 +; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] +; AVX1-NEXT: vmovd %xmm3, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm4 +; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +; AVX1-NEXT: vpextrw $1, %xmm3, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; AVX1-NEXT: vpextrw $1, %xmm2, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vmovd %xmm2, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; AVX1-NEXT: vmovd %xmm8, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX1-NEXT: vpextrw $1, %xmm8, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: popq %rax ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_cvt_16i16_to_16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: movswl 22(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8 -; AVX2-NEXT: movswl 20(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; 
AVX2-NEXT: vcvtph2ps %xmm0, %xmm9 -; AVX2-NEXT: movswl 16(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10 -; AVX2-NEXT: movswl 18(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11 -; AVX2-NEXT: movswl 30(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12 -; AVX2-NEXT: movswl 28(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13 -; AVX2-NEXT: movswl 24(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14 -; AVX2-NEXT: movswl 26(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15 -; AVX2-NEXT: movswl 6(%rdi), %eax +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: movl 20(%rdi), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl 16(%rdi), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl 28(%rdi), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl 24(%rdi), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: movl 4(%rdi), %ecx +; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl 12(%rdi), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl 8(%rdi), %eax +; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 +; AVX2-NEXT: vpextrw $1, %xmm7, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: vmovd %xmm7, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm7 +; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] +; AVX2-NEXT: vmovd %xmm6, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm7 +; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3] +; AVX2-NEXT: vpextrw $1, %xmm6, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm6 +; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[0] +; AVX2-NEXT: vpextrw $1, %xmm5, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm6 +; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX2-NEXT: vmovd %xmm5, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm5 +; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm6 +; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] +; AVX2-NEXT: vpextrw $1, %xmm0, %eax +; AVX2-NEXT: cwtl ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: movswl 4(%rdi), %eax +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpextrw $1, %xmm4, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-NEXT: vmovd %xmm4, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm4 +; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] +; AVX2-NEXT: vmovd %xmm3, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd 
%eax, %xmm4 +; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +; AVX2-NEXT: vpextrw $1, %xmm3, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; AVX2-NEXT: vpextrw $1, %xmm2, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vmovd %xmm2, %eax +; AVX2-NEXT: cwtl ; AVX2-NEXT: vmovd %eax, %xmm2 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: movswl (%rdi), %eax +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; AVX2-NEXT: vmovd %xmm8, %eax +; AVX2-NEXT: cwtl ; AVX2-NEXT: vmovd %eax, %xmm3 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: movswl 2(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm4 -; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX2-NEXT: movswl 14(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm5 -; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX2-NEXT: movswl 12(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: movswl 8(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm7 -; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX2-NEXT: movswl 10(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX2-NEXT: vpextrw $1, %xmm8, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm3 +; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: popq %rax ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_cvt_16i16_to_16f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: movswl 6(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8 -; AVX512F-NEXT: movswl 4(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9 -; AVX512F-NEXT: movswl (%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10 -; AVX512F-NEXT: movswl 2(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11 -; AVX512F-NEXT: movswl 14(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12 -; AVX512F-NEXT: movswl 12(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13 -; AVX512F-NEXT: movswl 8(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14 -; AVX512F-NEXT: movswl 10(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15 -; AVX512F-NEXT: movswl 22(%rdi), %eax +; AVX512F-NEXT: pushq %rax +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: movl 
4(%rdi), %ecx +; AVX512F-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl 12(%rdi), %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl 8(%rdi), %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl 20(%rdi), %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl 16(%rdi), %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl 28(%rdi), %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl 24(%rdi), %eax +; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 +; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 +; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 +; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 +; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 +; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 +; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 +; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 +; AVX512F-NEXT: vpextrw $1, %xmm7, %eax +; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512F-NEXT: movswl 20(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512F-NEXT: movswl 16(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: movswl 18(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: movswl 30(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512F-NEXT: movswl 28(%rdi), %eax +; AVX512F-NEXT: vmovd %xmm7, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[2,3] +; AVX512F-NEXT: vmovd %xmm6, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0],xmm0[3] +; AVX512F-NEXT: vpextrw $1, %xmm6, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[0] +; AVX512F-NEXT: vpextrw $1, %xmm5, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm5 ; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: movswl 24(%rdi), %eax +; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] +; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm6 ; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512F-NEXT: movswl 26(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 -; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] +; AVX512F-NEXT: vpextrw $1, %xmm4, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm4 +; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: vinsertps {{.*#+}} 
xmm1 = xmm14[0],xmm15[0],xmm14[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 +; AVX512F-NEXT: vpextrw $1, %xmm3, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[2,3] +; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vpextrw $1, %xmm2, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] +; AVX512F-NEXT: vpextrw $1, %xmm1, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; AVX512F-NEXT: vmovd %xmm8, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vpextrw $1, %xmm8, %eax +; AVX512F-NEXT: cwtl +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: popq %rax ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: load_cvt_16i16_to_16f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movswl 6(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm8 -; AVX512VL-NEXT: movswl 4(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm9 -; AVX512VL-NEXT: movswl (%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm10 -; AVX512VL-NEXT: movswl 2(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm11 -; AVX512VL-NEXT: movswl 14(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm12 -; AVX512VL-NEXT: movswl 12(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm5 -; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm13 -; AVX512VL-NEXT: movswl 8(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm14 -; AVX512VL-NEXT: movswl 10(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm15 -; AVX512VL-NEXT: movswl 22(%rdi), %eax +; AVX512VL-NEXT: pushq %rax +; AVX512VL-NEXT: movl (%rdi), %eax +; AVX512VL-NEXT: movl 4(%rdi), %ecx +; AVX512VL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl 12(%rdi), %eax +; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl 8(%rdi), %eax +; AVX512VL-NEXT: movl 
%eax, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl 20(%rdi), %eax +; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl 16(%rdi), %eax +; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl 28(%rdi), %eax +; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl 24(%rdi), %eax +; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 +; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 +; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 +; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 +; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 +; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 +; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 +; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 +; AVX512VL-NEXT: vpextrw $1, %xmm7, %eax +; AVX512VL-NEXT: cwtl ; AVX512VL-NEXT: vmovd %eax, %xmm0 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: movswl 20(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: movswl 16(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: movswl 18(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: movswl 30(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: movswl 28(%rdi), %eax +; AVX512VL-NEXT: vmovd %xmm7, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm7 +; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[2,3] +; AVX512VL-NEXT: vmovd %xmm6, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm7 +; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0],xmm0[3] +; AVX512VL-NEXT: vpextrw $1, %xmm6, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm6 +; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[0] +; AVX512VL-NEXT: vpextrw $1, %xmm5, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm6 +; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512VL-NEXT: vmovd %xmm5, %eax +; AVX512VL-NEXT: cwtl ; AVX512VL-NEXT: vmovd %eax, %xmm5 ; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512VL-NEXT: movswl 24(%rdi), %eax +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] +; AVX512VL-NEXT: vmovd %xmm4, %eax +; AVX512VL-NEXT: cwtl ; AVX512VL-NEXT: vmovd %eax, %xmm6 ; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: movswl 26(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] +; AVX512VL-NEXT: vpextrw $1, %xmm4, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm4 +; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm1[0,1,2],xmm12[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX512VL-NEXT: vpextrw $1, %xmm3, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm4 +; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512VL-NEXT: vmovd %xmm3, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX512VL-NEXT: vmovd %xmm2, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm4 +; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2,3] +; AVX512VL-NEXT: vmovd %xmm8, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX512VL-NEXT: vpextrw $1, %xmm8, %eax +; AVX512VL-NEXT: cwtl +; AVX512VL-NEXT: vmovd %eax, %xmm3 +; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-NEXT: popq %rax ; AVX512VL-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a0 %2 = bitcast <16 x i16> %1 to <16 x half> @@ -936,14 +1021,14 @@ ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: movswl %ax, %ecx ; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vmovd %ecx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vmovd %ecx, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm1 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; ALL-NEXT: retq %1 = bitcast <2 x i16> %a0 to <2 x half> %2 = fpext <2 x half> %1 to <2 x double> @@ -955,29 +1040,30 @@ ; ALL: # %bb.0: ; ALL-NEXT: vmovq %xmm0, %rax ; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movl %eax, %edx +; ALL-NEXT: movq %rax, %rdx ; ALL-NEXT: movswl %ax, %esi -; ALL-NEXT: shrq $48, %rax +; ALL-NEXT: # kill: def $eax killed $eax killed $rax +; ALL-NEXT: shrl $16, %eax ; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrl $16, %edx +; ALL-NEXT: shrq $48, %rdx ; ALL-NEXT: movswl %dx, %edx ; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: movswl %cx, %ecx +; ALL-NEXT: vmovd %ecx, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; ALL-NEXT: vmovd %esi, %xmm1 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: 
movswl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 +; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 +; ALL-NEXT: vmovd %eax, %xmm2 +; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; ALL-NEXT: retq %1 = bitcast <4 x i16> %a0 to <4 x half> %2 = fpext <4 x half> %1 to <4 x double> @@ -990,14 +1076,14 @@ ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: movswl %ax, %ecx ; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vmovd %ecx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vmovd %ecx, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm1 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; ALL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> %2 = bitcast <2 x i16> %1 to <2 x half> @@ -1010,29 +1096,30 @@ ; ALL: # %bb.0: ; ALL-NEXT: vmovq %xmm0, %rax ; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movl %eax, %edx +; ALL-NEXT: movq %rax, %rdx ; ALL-NEXT: movswl %ax, %esi -; ALL-NEXT: shrq $48, %rax +; ALL-NEXT: # kill: def $eax killed $eax killed $rax +; ALL-NEXT: shrl $16, %eax ; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrl $16, %edx +; ALL-NEXT: shrq $48, %rdx ; ALL-NEXT: movswl %dx, %edx ; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: movswl %cx, %ecx +; ALL-NEXT: vmovd %ecx, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; ALL-NEXT: vmovd %esi, %xmm1 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: movswl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 +; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; ALL-NEXT: cwtl -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 +; ALL-NEXT: vmovd %eax, %xmm2 +; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; ALL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = bitcast <4 x i16> %1 to <4 x half> @@ -1043,165 +1130,171 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; AVX1-LABEL: cvt_8i16_to_8f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %xmm0, %rdx +; AVX1-NEXT: vpextrq $1, %xmm0, %rdx ; AVX1-NEXT: movq %rdx, %r9 -; AVX1-NEXT: movl %edx, %r10d +; AVX1-NEXT: movq %rdx, %r10 ; AVX1-NEXT: movswl %dx, %r8d -; AVX1-NEXT: shrq $48, %rdx +; AVX1-NEXT: # kill: def $edx killed $edx killed $rdx +; AVX1-NEXT: shrl $16, 
%edx ; AVX1-NEXT: shrq $32, %r9 -; AVX1-NEXT: shrl $16, %r10d -; AVX1-NEXT: vpextrq $1, %xmm0, %rdi +; AVX1-NEXT: shrq $48, %r10 +; AVX1-NEXT: vmovq %xmm0, %rdi ; AVX1-NEXT: movq %rdi, %rsi -; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: movq %rdi, %rax ; AVX1-NEXT: movswl %di, %ecx -; AVX1-NEXT: shrq $48, %rdi +; AVX1-NEXT: # kill: def $edi killed $edi killed $rdi +; AVX1-NEXT: shrl $16, %edi ; AVX1-NEXT: shrq $32, %rsi -; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: shrq $48, %rax ; AVX1-NEXT: cwtl ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1 -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: movswl %si, %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3 +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: movswl %di, %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4 +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movswl %r10w, %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vmovd %r8d, %xmm5 -; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: movswl %r9w, %eax -; AVX1-NEXT: vmovd %eax, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: movswl %dx, %eax -; AVX1-NEXT: vmovd %eax, %xmm7 -; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovd %r8d, %xmm2 +; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: movswl %dx, %eax +; AVX1-NEXT: vmovd %eax, %xmm3 +; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: cvt_8i16_to_8f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx ; AVX2-NEXT: movq %rdx, %r9 -; AVX2-NEXT: movl %edx, %r10d +; AVX2-NEXT: movq %rdx, %r10 ; AVX2-NEXT: movswl %dx, %r8d -; AVX2-NEXT: shrq $48, %rdx +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx +; AVX2-NEXT: shrl $16, %edx ; AVX2-NEXT: shrq $32, %r9 -; AVX2-NEXT: shrl $16, %r10d -; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: shrq $48, %r10 +; AVX2-NEXT: vmovq %xmm0, %rdi ; AVX2-NEXT: movq %rdi, %rsi -; 
AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: movq %rdi, %rax
 ; AVX2-NEXT: movswl %di, %ecx
-; AVX2-NEXT: shrq $48, %rdi
+; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi
+; AVX2-NEXT: shrl $16, %edi
 ; AVX2-NEXT: shrq $32, %rsi
-; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: shrq $48, %rax
 ; AVX2-NEXT: cwtl
 ; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT: movswl %si, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT: movswl %r10w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vmovd %r8d, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: movswl %r9w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: movswl %dx, %eax
-; AVX2-NEXT: vmovd %eax, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovd %r8d, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: movswl %dx, %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: cvt_8i16_to_8f64:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: vmovq %xmm0, %rdx
 ; AVX512-NEXT: movq %rdx, %r9
-; AVX512-NEXT: movl %edx, %r10d
+; AVX512-NEXT: movq %rdx, %r10
 ; AVX512-NEXT: movswl %dx, %r8d
-; AVX512-NEXT: shrq $48, %rdx
+; AVX512-NEXT: # kill: def $edx killed $edx killed $rdx
+; AVX512-NEXT: shrl $16, %edx
 ; AVX512-NEXT: shrq $32, %r9
-; AVX512-NEXT: shrl $16, %r10d
-; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: shrq $48, %r10
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdi
 ; AVX512-NEXT: movq %rdi, %rsi
-; AVX512-NEXT: movl %edi, %eax
+; AVX512-NEXT: movq %rdi, %rax
 ; AVX512-NEXT: movswl %di, %ecx
-; AVX512-NEXT: shrq $48, %rdi
+; AVX512-NEXT: # kill: def $edi killed $edi killed $rdi
+; AVX512-NEXT: shrl $16, %edi
 ; AVX512-NEXT: shrq $32, %rsi
-; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: shrq $48, %rax
 ; AVX512-NEXT: cwtl
 ; AVX512-NEXT: vmovd %eax, %xmm0
 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: movswl %si, %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX512-NEXT: vmovd %ecx, %xmm1
 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: movswl %si, %eax
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: movswl %di, %eax
 ; AVX512-NEXT: vmovd %eax, %xmm2
 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: movswl %di, %eax
-; AVX512-NEXT: vmovd %eax, %xmm3
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT: movswl %r10w, %eax
-; AVX512-NEXT: vmovd %eax, %xmm4
-; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512-NEXT: vmovd %r8d, %xmm5
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT: movswl %r9w, %eax
-; AVX512-NEXT: vmovd %eax, %xmm6
-; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: vmovd %r8d, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
 ; AVX512-NEXT: movswl %dx, %eax
-; AVX512-NEXT: vmovd %eax, %xmm7
-; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT: retq
 %1 = bitcast <8 x i16> %a0 to <8 x half>
 %2 = fpext <8 x half> %1 to <8 x double>
@@ -1229,15 +1322,15 @@
 define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
 ; ALL-LABEL: load_cvt_2i16_to_2f64:
 ; ALL: # %bb.0:
-; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: movswl 2(%rdi), %eax
 ; ALL-NEXT: vmovd %eax, %xmm0
 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: movswl (%rdi), %eax
 ; ALL-NEXT: vmovd %eax, %xmm1
 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; ALL-NEXT: retq
 %1 = load <2 x i16>, <2 x i16>* %a0
 %2 = bitcast <2 x i16> %1 to <2 x half>
@@ -1248,25 +1341,25 @@
 define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
 ; ALL-LABEL: load_cvt_4i16_to_4f64:
 ; ALL: # %bb.0:
-; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: movswl 6(%rdi), %eax
 ; ALL-NEXT: vmovd %eax, %xmm0
 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; ALL-NEXT: movswl 2(%rdi), %eax
 ; ALL-NEXT: vmovd %eax, %xmm1
 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
 ; ALL-NEXT: vmovd %eax, %xmm2
 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
-; ALL-NEXT: movswl 6(%rdi), %eax
-; ALL-NEXT: vmovd %eax, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; ALL-NEXT: retq
 %1 = load <4 x i16>, <4 x i16>* %a0
 %2 = bitcast <4 x i16> %1 to <4 x half>
@@ -1279,29 +1372,30 @@
 ; ALL: # %bb.0:
 ; ALL-NEXT: movq (%rdi), %rax
 ; ALL-NEXT: movq %rax, %rcx
-; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: movq %rax, %rdx
 ; ALL-NEXT: movswl %ax, %esi
-; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: # kill: def $eax killed $eax killed $rax
+; ALL-NEXT: shrl $16, %eax
 ; ALL-NEXT: shrq $32, %rcx
-; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: shrq $48, %rdx
 ; ALL-NEXT: movswl %dx, %edx
 ; ALL-NEXT: vmovd %edx, %xmm0
 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; ALL-NEXT: vmovd %esi, %xmm1
 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: movswl %cx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm2
-; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
 ; ALL-NEXT: cwtl
-; ALL-NEXT: vmovd %eax, %xmm3
-; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
-; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; ALL-NEXT: retq
 %1 = load <8 x i16>, <8 x i16>* %a0
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32>
@@ -1313,129 +1407,129 @@
 define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
 ; AVX1-LABEL: load_cvt_8i16_to_8f64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: movswl 8(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
-; AVX1-NEXT: movswl 10(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX1-NEXT: movswl 12(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
-; AVX1-NEXT: movswl 14(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
-; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: movswl 6(%rdi), %eax
 ; AVX1-NEXT: vmovd %eax, %xmm0
 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: movswl 4(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: movswl 2(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movswl 14(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: movswl 12(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: movswl 10(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: movswl 8(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: load_cvt_8i16_to_8f64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: movswl 8(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
-; AVX2-NEXT: movswl 10(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX2-NEXT: movswl 12(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
-; AVX2-NEXT: movswl 14(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
-; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: movswl 6(%rdi), %eax
 ; AVX2-NEXT: vmovd %eax, %xmm0
 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
-; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: movswl 4(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: movswl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movswl 14(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: movswl 12(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: movswl 10(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: movswl 8(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_cvt_8i16_to_8f64:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: movswl 14(%rdi), %eax
 ; AVX512-NEXT: vmovd %eax, %xmm0
 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: movswl 12(%rdi), %eax
 ; AVX512-NEXT: vmovd %eax, %xmm1
 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: movswl 8(%rdi), %eax
 ; AVX512-NEXT: vmovd %eax, %xmm2
 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: movswl (%rdi), %eax
 ; AVX512-NEXT: vmovd %eax, %xmm3
 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: movswl 8(%rdi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm4
-; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512-NEXT: movswl 10(%rdi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm5
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: movswl 12(%rdi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm6
-; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512-NEXT: movswl 14(%rdi), %eax
-; AVX512-NEXT: vmovd %eax, %xmm7
-; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-NEXT: retq
 %1 = load <8 x i16>, <8 x i16>* %a0
 %2 = bitcast <8 x i16> %1 to <8 x half>