Index: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetLowering.h
+++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h
@@ -3663,6 +3663,12 @@
   /// \returns True, if the expansion was successful, false otherwise
   bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

+  /// Expand UINT(i64) to double(f64) conversion
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
   SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;

Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2369,30 +2369,6 @@
   assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");

   // Code below here assumes !isSigned without checking again.
-  // Implementation of unsigned i64 to f64 following the algorithm in
-  // __floatundidf in compiler_rt. This implementation has the advantage
-  // of performing rounding correctly, both in the default rounding mode
-  // and in all alternate rounding modes.
-  // TODO: Generalize this for use with other types.
-  if (SrcVT == MVT::i64 && DestVT == MVT::f64) {
-    LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f64\n");
-    SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
-    SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
-        BitsToDouble(UINT64_C(0x4530000000100000)), dl, DestVT);
-    SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
-    SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
-    SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
-
-    SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Op0, LoMask);
-    SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, HiShift);
-    SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
-    SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
-    SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, DestVT, LoOr);
-    SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, DestVT, HiOr);
-    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DestVT, HiFlt, TwoP84PlusTwoP52);
-    return DAG.getNode(ISD::FADD, dl, DestVT, LoFlt, HiSub);
-  }
-
   // TODO: Generalize this for use with other types.
   if (SrcVT == MVT::i64 && DestVT == MVT::f32) {
     LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f32\n");
@@ -2921,8 +2897,13 @@
     Results.push_back(Tmp1);
     break;
   }
-  case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
+    if (TLI.expandUINT_TO_FP(Node, Tmp1, DAG)) {
+      Results.push_back(Tmp1);
+      break;
+    }
+    LLVM_FALLTHROUGH;
+  case ISD::SINT_TO_FP:
     Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP,
                                 Node->getOperand(0), Node->getValueType(0), dl);
     Results.push_back(Tmp1);
Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1022,6 +1022,11 @@
   EVT VT = Op.getOperand(0).getValueType();
   SDLoc DL(Op);

+  // Attempt to expand using TargetLowering.
+ SDValue Result; + if (TLI.expandUINT_TO_FP(Op.getNode(), Result, DAG)) + return Result; + // Make sure that the SINT_TO_FP and SRL instructions are available. if (TLI.getOperationAction(ISD::SINT_TO_FP, VT) == TargetLowering::Expand || TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) Index: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4137,6 +4137,48 @@ return true; } +bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, + SelectionDAG &DAG) const { + SDValue Src = Node->getOperand(0); + EVT SrcVT = Src.getValueType(); + EVT DstVT = Node->getValueType(0); + + if (SrcVT.getScalarType() != MVT::i64 || DstVT.getScalarType() != MVT::f64) + return false; + + // Only expand vector types if we have the appropriate vector bit operations. + if (SrcVT.isVector() && (!isOperationLegalOrCustom(ISD::SRL, SrcVT) || + !isOperationLegalOrCustom(ISD::FADD, DstVT) || + !isOperationLegalOrCustom(ISD::FSUB, DstVT) || + !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) || + !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT))) + return false; + + SDLoc dl(SDValue(Node, 0)); + EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout()); + + // Implementation of unsigned i64 to f64 following the algorithm in + // __floatundidf in compiler_rt. This implementation has the advantage + // of performing rounding correctly, both in the default rounding mode + // and in all alternate rounding modes. + SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT); + SDValue TwoP84PlusTwoP52 = + DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT); + SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT); + SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT); + SDValue HiShift = DAG.getConstant(32, dl, ShiftVT); + + SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Src, LoMask); + SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Src, HiShift); + SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52); + SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84); + SDValue LoFlt = DAG.getBitcast(DstVT, LoOr); + SDValue HiFlt = DAG.getBitcast(DstVT, HiOr); + SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); + Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); + return true; +} + SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, SelectionDAG &DAG) const { SDLoc dl(Node); Index: llvm/trunk/test/CodeGen/X86/avx512-cvt.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-cvt.ll +++ llvm/trunk/test/CodeGen/X86/avx512-cvt.ll @@ -484,32 +484,12 @@ define <8 x double> @ulto8f64(<8 x i64> %a) { ; NODQ-LABEL: ulto8f64: ; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 +; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; NODQ-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; NODQ-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ; NODQ-NEXT: retq ; ; VLDQ-LABEL: ulto8f64: @@ -524,32 +504,12 @@ ; ; KNL_WIDEN-LABEL: ulto8f64: ; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL_WIDEN-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; KNL_WIDEN-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; KNL_WIDEN-NEXT: vpsrlq $32, %zmm0, %zmm0 +; KNL_WIDEN-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; KNL_WIDEN-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; KNL_WIDEN-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ; KNL_WIDEN-NEXT: retq %b = uitofp <8 x i64> %a to <8 x double> ret <8 x double> %b @@ -558,58 +518,22 @@ define <16 x double> @ulto16f64(<16 x i64> %a) { ; NODQ-LABEL: ulto16f64: ; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = 
xmm3[0],xmm4[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] +; NODQ-NEXT: vpandq %zmm2, %zmm0, %zmm3 +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; NODQ-NEXT: vporq %zmm4, %zmm3, %zmm3 +; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; NODQ-NEXT: vporq %zmm5, %zmm0, %zmm0 +; NODQ-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; NODQ-NEXT: vsubpd %zmm6, %zmm0, %zmm0 +; NODQ-NEXT: vaddpd %zmm0, %zmm3, %zmm0 +; NODQ-NEXT: vpandq %zmm2, %zmm1, %zmm2 +; NODQ-NEXT: vporq %zmm4, %zmm2, %zmm2 +; NODQ-NEXT: vpsrlq $32, %zmm1, %zmm1 +; NODQ-NEXT: vporq %zmm5, %zmm1, %zmm1 +; NODQ-NEXT: vsubpd %zmm6, %zmm1, %zmm1 +; NODQ-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ; NODQ-NEXT: retq ; ; VLDQ-LABEL: ulto16f64: @@ -626,58 +550,22 @@ ; ; KNL_WIDEN-LABEL: ulto16f64: ; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm3 -; 
KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm1, %xmm3 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 +; KNL_WIDEN-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] +; KNL_WIDEN-NEXT: vpandq %zmm2, %zmm0, %zmm3 +; KNL_WIDEN-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; KNL_WIDEN-NEXT: vporq %zmm4, %zmm3, %zmm3 +; KNL_WIDEN-NEXT: vpsrlq $32, %zmm0, %zmm0 +; KNL_WIDEN-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; KNL_WIDEN-NEXT: vporq %zmm5, %zmm0, %zmm0 +; KNL_WIDEN-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; KNL_WIDEN-NEXT: vsubpd %zmm6, %zmm0, %zmm0 +; KNL_WIDEN-NEXT: vaddpd %zmm0, %zmm3, %zmm0 +; KNL_WIDEN-NEXT: vpandq %zmm2, %zmm1, %zmm2 +; KNL_WIDEN-NEXT: vporq %zmm4, %zmm2, %zmm2 +; KNL_WIDEN-NEXT: vpsrlq $32, %zmm1, %zmm1 +; KNL_WIDEN-NEXT: vporq %zmm5, %zmm1, %zmm1 +; KNL_WIDEN-NEXT: vsubpd %zmm6, %zmm1, %zmm1 +; KNL_WIDEN-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ; KNL_WIDEN-NEXT: retq %b = uitofp <16 x i64> %a to <16 x double> ret <16 x double> %b Index: llvm/trunk/test/CodeGen/X86/ftrunc.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/ftrunc.ll +++ llvm/trunk/test/CodeGen/X86/ftrunc.ll @@ -106,39 +106,34 @@ define <2 x double> @trunc_unsigned_v2f64(<2 x 
double> %x) #0 { ; SSE2-LABEL: trunc_unsigned_v2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: movapd %xmm1, %xmm3 -; SSE2-NEXT: subsd %xmm2, %xmm3 -; SSE2-NEXT: cvttsd2si %xmm3, %rax -; SSE2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE2-NEXT: xorq %rcx, %rax -; SSE2-NEXT: cvttsd2si %xmm1, %rdx -; SSE2-NEXT: ucomisd %xmm2, %xmm1 -; SSE2-NEXT: cmovaeq %rax, %rdx ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: subsd %xmm2, %xmm1 ; SSE2-NEXT: cvttsd2si %xmm1, %rax +; SSE2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; SSE2-NEXT: xorq %rcx, %rax +; SSE2-NEXT: cvttsd2si %xmm0, %rdx +; SSE2-NEXT: ucomisd %xmm2, %xmm0 +; SSE2-NEXT: cmovaeq %rax, %rdx +; SSE2-NEXT: movq %rdx, %xmm1 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movapd %xmm0, %xmm3 +; SSE2-NEXT: subsd %xmm2, %xmm3 +; SSE2-NEXT: cvttsd2si %xmm3, %rax ; SSE2-NEXT: xorq %rcx, %rax ; SSE2-NEXT: cvttsd2si %xmm0, %rcx ; SSE2-NEXT: ucomisd %xmm2, %xmm0 ; SSE2-NEXT: cmovaeq %rax, %rcx -; SSE2-NEXT: movq %rcx, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25] -; SSE2-NEXT: subpd %xmm3, %xmm1 +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: subpd {{.*}}(%rip), %xmm1 +; SSE2-NEXT: addpd %xmm0, %xmm1 ; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: subpd %xmm3, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm2 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_unsigned_v2f64: @@ -158,68 +153,62 @@ define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 { ; SSE2-LABEL: trunc_unsigned_v4f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm1, %xmm3 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: movapd %xmm3, %xmm4 -; SSE2-NEXT: subsd %xmm2, %xmm4 -; SSE2-NEXT: cvttsd2si %xmm4, %rcx -; SSE2-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000 -; SSE2-NEXT: xorq %rdx, %rcx -; SSE2-NEXT: cvttsd2si %xmm3, %rax -; SSE2-NEXT: ucomisd %xmm2, %xmm3 -; SSE2-NEXT: cmovaeq %rcx, %rax -; SSE2-NEXT: movapd %xmm1, %xmm3 -; SSE2-NEXT: subsd %xmm2, %xmm3 -; SSE2-NEXT: cvttsd2si %xmm3, %rsi -; SSE2-NEXT: xorq %rdx, %rsi +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: subsd %xmm3, %xmm1 ; SSE2-NEXT: cvttsd2si %xmm1, %rcx -; SSE2-NEXT: ucomisd %xmm2, %xmm1 -; SSE2-NEXT: cmovaeq %rsi, %rcx -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: movapd %xmm1, %xmm3 -; SSE2-NEXT: subsd %xmm2, %xmm3 -; SSE2-NEXT: cvttsd2si %xmm3, %rsi -; SSE2-NEXT: xorq %rdx, %rsi -; SSE2-NEXT: cvttsd2si %xmm1, %rdi -; SSE2-NEXT: ucomisd %xmm2, %xmm1 -; SSE2-NEXT: cmovaeq %rsi, %rdi -; 
SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: subsd %xmm2, %xmm1 -; SSE2-NEXT: cvttsd2si %xmm1, %rsi -; SSE2-NEXT: xorq %rdx, %rsi -; SSE2-NEXT: cvttsd2si %xmm0, %rdx -; SSE2-NEXT: ucomisd %xmm2, %xmm0 -; SSE2-NEXT: cmovaeq %rsi, %rdx +; SSE2-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; SSE2-NEXT: xorq %rax, %rcx +; SSE2-NEXT: cvttsd2si %xmm2, %rdx +; SSE2-NEXT: ucomisd %xmm3, %xmm2 +; SSE2-NEXT: cmovaeq %rcx, %rdx ; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25] -; SSE2-NEXT: subpd %xmm3, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: movq %rdi, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: subpd %xmm3, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm4 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm4 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE2-NEXT: movq %rcx, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-NEXT: subpd %xmm3, %xmm4 -; SSE2-NEXT: movapd %xmm4, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE2-NEXT: addpd %xmm4, %xmm1 -; SSE2-NEXT: movq %rax, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-NEXT: subpd %xmm3, %xmm4 -; SSE2-NEXT: movapd %xmm4, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE2-NEXT: addpd %xmm4, %xmm2 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: movapd %xmm2, %xmm4 +; SSE2-NEXT: subsd %xmm3, %xmm4 +; SSE2-NEXT: cvttsd2si %xmm4, %rcx +; SSE2-NEXT: xorq %rax, %rcx +; SSE2-NEXT: cvttsd2si %xmm2, %rdx +; SSE2-NEXT: ucomisd %xmm3, %xmm2 +; SSE2-NEXT: cmovaeq %rcx, %rdx +; SSE2-NEXT: movq %rdx, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: subsd %xmm3, %xmm2 +; SSE2-NEXT: cvttsd2si %xmm2, %rcx +; SSE2-NEXT: xorq %rax, %rcx +; SSE2-NEXT: cvttsd2si %xmm0, %rdx +; SSE2-NEXT: ucomisd %xmm3, %xmm0 +; SSE2-NEXT: cmovaeq %rcx, %rdx +; SSE2-NEXT: movq %rdx, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movapd %xmm0, %xmm4 +; SSE2-NEXT: subsd %xmm3, %xmm4 +; SSE2-NEXT: cvttsd2si %xmm4, %rcx +; SSE2-NEXT: xorq %rax, %rcx +; SSE2-NEXT: cvttsd2si %xmm0, %rax +; SSE2-NEXT: ucomisd %xmm3, %xmm0 +; SSE2-NEXT: cmovaeq %rcx, %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE2-NEXT: subpd %xmm6, %xmm2 +; SSE2-NEXT: addpd %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: subpd %xmm6, %xmm1 +; SSE2-NEXT: addpd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: 
trunc_unsigned_v4f64: Index: llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll +++ llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll @@ -497,63 +497,67 @@ define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) { ; SSE2-LABEL: uitofp_2i64_to_2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25] -; SSE2-NEXT: subpd %xmm4, %xmm0 -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: subpd %xmm4, %xmm3 -; SSE2-NEXT: movapd %xmm3, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE2-NEXT: addpd %xmm3, %xmm0 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: uitofp_2i64_to_2f64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25] -; SSE41-NEXT: subpd %xmm3, %xmm0 -; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE41-NEXT: subpd %xmm3, %xmm2 -; SSE41-NEXT: haddpd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: por {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: por {{.*}}(%rip), %xmm0 +; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_2i64_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] -; VEX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25] -; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 -; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 -; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_2i64_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i64_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; 
AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i64_to_2f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_2i64_to_2f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_2i64_to_2f64: @@ -837,104 +841,96 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; SSE2-LABEL: uitofp_4i64_to_4f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25] -; SSE2-NEXT: subpd %xmm5, %xmm2 -; SSE2-NEXT: movapd %xmm2, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE2-NEXT: addpd %xmm2, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: subpd %xmm5, %xmm4 -; SSE2-NEXT: movapd %xmm4, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE2-NEXT: addpd %xmm4, %xmm2 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: subpd %xmm5, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: subpd %xmm5, %xmm4 -; SSE2-NEXT: movapd %xmm4, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE2-NEXT: addpd %xmm4, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: movapd %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE2-NEXT: subpd %xmm6, %xmm0 +; SSE2-NEXT: addpd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: subpd %xmm6, %xmm1 +; SSE2-NEXT: addpd %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: 
uitofp_4i64_to_4f64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25] -; SSE41-NEXT: subpd %xmm4, %xmm0 -; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE41-NEXT: subpd %xmm4, %xmm3 -; SSE41-NEXT: haddpd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE41-NEXT: subpd %xmm4, %xmm1 -; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE41-NEXT: subpd %xmm4, %xmm3 -; SSE41-NEXT: haddpd %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE41-NEXT: por %xmm4, %xmm3 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE41-NEXT: subpd %xmm6, %xmm0 +; SSE41-NEXT: addpd %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm4, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm1 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: subpd %xmm6, %xmm1 +; SSE41-NEXT: addpd %xmm2, %xmm1 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_4i64_to_4f64: -; VEX: # %bb.0: -; VEX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; VEX-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; VEX-NEXT: vmovapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25] -; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3 -; VEX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1] -; VEX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; VEX-NEXT: vsubpd %xmm4, %xmm1, %xmm1 -; VEX-NEXT: vhaddpd %xmm1, %xmm3, %xmm1 -; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3 -; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; VEX-NEXT: vsubpd %xmm4, %xmm0, %xmm0 -; VEX-NEXT: vhaddpd %xmm0, %xmm3, %xmm0 -; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_4i64_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_4i64_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq 
{{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_4i64_to_4f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_4i64_to_4f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_4i64_to_4f64: @@ -3446,67 +3442,73 @@ define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) { ; SSE2-LABEL: uitofp_load_2i64_to_2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25] -; SSE2-NEXT: subpd %xmm4, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrlq $32, %xmm0 
+; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0 ; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: subpd %xmm4, %xmm3 -; SSE2-NEXT: movapd %xmm3, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE2-NEXT: addpd %xmm3, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: uitofp_load_2i64_to_2f64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25] -; SSE41-NEXT: subpd %xmm3, %xmm0 -; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE41-NEXT: subpd %xmm3, %xmm2 -; SSE41-NEXT: haddpd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: por {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: por {{.*}}(%rip), %xmm0 +; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_load_2i64_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vmovapd (%rdi), %xmm0 -; VEX-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] -; VEX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25] -; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2 -; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0 -; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_load_2i64_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_2i64_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_2i64_to_2f64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_2i64_to_2f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64: @@ -3652,109 +3654,104 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; SSE2-LABEL: uitofp_load_4i64_to_4f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25] -; SSE2-NEXT: subpd %xmm5, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: subpd %xmm5, %xmm4 -; SSE2-NEXT: movapd %xmm4, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE2-NEXT: addpd %xmm4, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: subpd %xmm5, %xmm2 -; SSE2-NEXT: movapd %xmm2, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE2-NEXT: subpd %xmm6, %xmm0 +; SSE2-NEXT: addpd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: subpd %xmm6, %xmm1 ; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: subpd %xmm5, %xmm4 -; SSE2-NEXT: movapd %xmm4, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE2-NEXT: addpd %xmm4, %xmm2 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: uitofp_load_4i64_to_4f64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm0 ; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25] -; SSE41-NEXT: subpd %xmm4, %xmm0 -; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE41-NEXT: subpd %xmm4, %xmm3 -; SSE41-NEXT: haddpd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE41-NEXT: subpd %xmm4, %xmm1 -; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE41-NEXT: subpd %xmm4, %xmm3 -; SSE41-NEXT: haddpd %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE41-NEXT: por %xmm4, %xmm3 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE41-NEXT: subpd %xmm6, %xmm0 +; SSE41-NEXT: addpd %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm4, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm1 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: subpd %xmm6, %xmm1 +; SSE41-NEXT: addpd %xmm2, %xmm1 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_load_4i64_to_4f64: -; VEX: # %bb.0: -; VEX-NEXT: vmovapd (%rdi), %ymm0 -; VEX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; VEX-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; VEX-NEXT: vmovapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25] -; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3 -; VEX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1] -; VEX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; VEX-NEXT: vsubpd %xmm4, %xmm1, %xmm1 -; VEX-NEXT: vhaddpd %xmm1, %xmm3, %xmm1 -; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3 -; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; VEX-NEXT: vsubpd %xmm4, %xmm0, %xmm0 -; VEX-NEXT: vhaddpd %xmm0, %xmm3, %xmm0 -; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_load_4i64_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_4i64_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_4i64_to_4f64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 
(%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_4i64_to_4f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
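
Editorial note, not part of the patch: the expansion emitted by expandUINT_TO_FP is the double-word trick from compiler-rt's __floatundidf, and the magic numbers appearing in the updated CHECK lines are simply its constants (0x4330000000000000 = 4841369599423283200 is the bit pattern of 2^52, 0x4530000000000000 = 4985484787499139072 is 2^84, and 0x4530000000100000 encodes 2^84 + 2^52 = 1.9342813118337666E+25). Below is a minimal scalar C++ sketch of the same arithmetic; the helper name u64_to_f64 is purely illustrative and does not exist in LLVM or compiler-rt.

#include <cstdint>
#include <cstring>

// Illustrative scalar model of the DAG expansion above.
static double u64_to_f64(uint64_t X) {
  // OR the 32-bit halves of X into the mantissas of 2^52 and 2^84; the
  // resulting doubles represent the halves exactly (no rounding yet).
  uint64_t LoBits = (X & UINT64_C(0xFFFFFFFF)) | UINT64_C(0x4330000000000000);
  uint64_t HiBits = (X >> 32) | UINT64_C(0x4530000000000000);
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo)); // Lo == 2^52 + (X & 0xFFFFFFFF)
  std::memcpy(&Hi, &HiBits, sizeof(Hi)); // Hi == 2^84 + (X >> 32) * 2^32
  uint64_t BiasBits = UINT64_C(0x4530000000100000); // 2^84 + 2^52
  double Bias;
  std::memcpy(&Bias, &BiasBits, sizeof(Bias));
  // The subtraction cancels both exponent biases exactly, so the final add
  // is the only rounding step and the result is correctly rounded in every
  // rounding mode.
  return Lo + (Hi - Bias);
}

For example, X = 0x100000000 (2^32) gives Lo = 2^52 and Hi = 2^84 + 2^32; subtracting the bias leaves 2^32 - 2^52, and adding Lo yields exactly 4294967296.0.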