diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2460,12 +2460,19 @@ assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet"); // TODO: Generalize this for use with other types. - if ((SrcVT == MVT::i32 || SrcVT == MVT::i64) && DestVT == MVT::f32) { - LLVM_DEBUG(dbgs() << "Converting unsigned i32/i64 to f32\n"); + if (((SrcVT == MVT::i32 || SrcVT == MVT::i64) && DestVT == MVT::f32) || + (SrcVT == MVT::i64 && DestVT == MVT::f64)) { + LLVM_DEBUG(dbgs() << "Converting unsigned i32/i64 to f32/f64\n"); // For unsigned conversions, convert them to signed conversions using the // algorithm from the x86_64 __floatundisf in compiler_rt. That method // should be valid for i32->f32 as well. + // More generally this transform should be valid if there are 3 more bits + // in the integer type than the significand. Rounding uses the first bit + // after the width of the significand and the OR of all bits after that. So + // we need to be able to OR the shifted out bit into one of the bits that + // participate in the OR. + // TODO: This really should be implemented using a branch rather than a // select. We happen to get lucky and machinesink does the right // thing most of the time. This would be a good candidate for a diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6528,8 +6528,13 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, SDValue &Chain, SelectionDAG &DAG) const { - unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0; - SDValue Src = Node->getOperand(OpNo); + // This transform is not correct for converting 0 when rounding mode is set + // to round toward negative infinity which will produce -0.0. 
So disable under + // strictfp. + if (Node->isStrictFPOpcode()) + return false; + + SDValue Src = Node->getOperand(0); EVT SrcVT = Src.getValueType(); EVT DstVT = Node->getValueType(0); @@ -6548,9 +6553,10 @@ EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout()); // Implementation of unsigned i64 to f64 following the algorithm in - // __floatundidf in compiler_rt. This implementation has the advantage - // of performing rounding correctly, both in the default rounding mode - // and in all alternate rounding modes. + // __floatundidf in compiler_rt. This implementation performs rounding + // correctly in all rounding modes with the exception of converting 0 + // when rounding toward negative infinity. In that case the fsub will produce + // -0.0. This will be added to +0.0 and produce -0.0 which is incorrect. SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT); SDValue TwoP84PlusTwoP52 = DAG.getConstantFP( BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT); @@ -6564,18 +6570,9 @@ SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84); SDValue LoFlt = DAG.getBitcast(DstVT, LoOr); SDValue HiFlt = DAG.getBitcast(DstVT, HiOr); - if (Node->isStrictFPOpcode()) { - SDValue HiSub = - DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other}, - {Node->getOperand(0), HiFlt, TwoP84PlusTwoP52}); - Result = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other}, - {HiSub.getValue(1), LoFlt, HiSub}); - Chain = Result.getValue(1); - } else { - SDValue HiSub = - DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); - Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); - } + SDValue HiSub = + DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52); + Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub); return true; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -19885,6 +19885,10 @@ /// 
64-bit unsigned integer to double expansion. static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0 + // when converting 0 when rounding toward negative infinity. Caller will + // fall back to Expand when i64 is legal, or use FILD in 32-bit mode. + assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!"); // This algorithm is not obvious. Here it is what we're trying to output: /* movq %rax, %xmm0 @@ -19898,8 +19902,6 @@ #endif */ - bool IsStrict = Op->isStrictFPOpcode(); - unsigned OpNo = IsStrict ? 1 : 0; SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); @@ -19921,7 +19923,7 @@ // Load the 64-bit value into an XMM register. SDValue XR1 = - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo)); + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); SDValue CLod0 = DAG.getLoad( MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16)); @@ -19932,35 +19934,19 @@ MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16)); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); - SDValue Sub; - SDValue Chain; // TODO: Are there any fast-math-flags to propagate here? - if (IsStrict) { - Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, - {Op.getOperand(0), XR2F, CLod1}); - Chain = Sub.getValue(1); - } else - Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; - if (!IsStrict && Subtarget.hasSSE3() && + if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { - // FIXME: Do we need a STRICT version of FHADD? 
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); - if (IsStrict) { - Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other}, - {Chain, Shuffle, Sub}); - Chain = Result.getValue(1); - } else - Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0, dl)); - if (IsStrict) - return DAG.getMergeValues({Result, Chain}, dl); - return Result; } @@ -20286,11 +20272,14 @@ if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; - if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) + // The transform for i64->f64 isn't correct for 0 when rounding to negative + // infinity. It produces -0.0, so disable under strictfp. + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict) return LowerUINT_TO_FP_i64(Op, DAG, Subtarget); if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80) return LowerUINT_TO_FP_i32(Op, DAG, Subtarget); - if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) + if (Subtarget.is64Bit() && SrcVT == MVT::i64 && + (DstVT == MVT::f32 || DstVT == MVT::f64)) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-08.ll @@ -22,7 +22,7 @@ ; Test i64->f64. 
define double @f2(i64 %i) #0 { ; CHECK-LABEL: f2: -; CHECK: ldgr +; CHECK: cdgbr ; CHECK: adbr ; CHECK: br %r14 %conv = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %i, diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -2492,38 +2492,54 @@ ; ; X86-SSE-LABEL: uifdl: ; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: subl $28, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; X86-SSE-NEXT: subpd {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: movapd %xmm0, %xmm1 -; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; X86-SSE-NEXT: addpd %xmm0, %xmm1 -; X86-SSE-NEXT: movlpd %xmm1, (%esp) +; X86-SSE-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: shrl $31, %eax +; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; X86-SSE-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE-NEXT: wait +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd %xmm0, (%esp) ; X86-SSE-NEXT: fldl (%esp) ; X86-SSE-NEXT: wait -; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: addl $28, %esp ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE-NEXT: retl ; ; SSE-LABEL: uifdl: ; SSE: # %bb.0: # %entry -; SSE-NEXT: movq %rdi, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: subpd {{.*}}(%rip), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: shrq %rax +; SSE-NEXT: movl %edi, %ecx +; SSE-NEXT: andl $1, %ecx +; SSE-NEXT: orq %rax, %rcx +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnsq %rdi, %rcx +; SSE-NEXT: cvtsi2sd %rcx, %xmm0 +; 
SSE-NEXT: jns .LBB48_2 +; SSE-NEXT: # %bb.1: +; SSE-NEXT: addsd %xmm0, %xmm0 +; SSE-NEXT: .LBB48_2: # %entry ; SSE-NEXT: retq ; ; AVX1-LABEL: uifdl: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovq %rdi, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: cmovnsq %rdi, %rcx +; AVX1-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 +; AVX1-NEXT: jns .LBB48_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: .LBB48_2: # %entry ; AVX1-NEXT: retq ; ; AVX512-LABEL: uifdl: diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll @@ -1262,14 +1262,17 @@ ; SSE-X86-NEXT: movl %esp, %ebp ; SSE-X86-NEXT: .cfi_def_cfa_register %ebp ; SSE-X86-NEXT: andl $-8, %esp -; SSE-X86-NEXT: subl $8, %esp +; SSE-X86-NEXT: subl $24, %esp +; SSE-X86-NEXT: movl 12(%ebp), %eax ; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-X86-NEXT: subpd {{\.LCPI.*}}, %xmm0 -; SSE-X86-NEXT: movapd %xmm0, %xmm1 -; SSE-X86-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-X86-NEXT: addpd %xmm0, %xmm1 -; SSE-X86-NEXT: movlpd %xmm1, (%esp) +; SSE-X86-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: shrl $31, %eax +; SSE-X86-NEXT: fildll {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; SSE-X86-NEXT: fstpl {{[0-9]+}}(%esp) +; SSE-X86-NEXT: wait +; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-X86-NEXT: movsd %xmm0, (%esp) ; SSE-X86-NEXT: fldl (%esp) ; SSE-X86-NEXT: wait ; 
SSE-X86-NEXT: movl %ebp, %esp @@ -1279,12 +1282,18 @@ ; ; SSE-X64-LABEL: uitofp_i64tof64: ; SSE-X64: # %bb.0: -; SSE-X64-NEXT: movq %rdi, %xmm1 -; SSE-X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-X64-NEXT: subpd {{.*}}(%rip), %xmm1 -; SSE-X64-NEXT: movapd %xmm1, %xmm0 -; SSE-X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-X64-NEXT: addpd %xmm1, %xmm0 +; SSE-X64-NEXT: movq %rdi, %rax +; SSE-X64-NEXT: shrq %rax +; SSE-X64-NEXT: movl %edi, %ecx +; SSE-X64-NEXT: andl $1, %ecx +; SSE-X64-NEXT: orq %rax, %rcx +; SSE-X64-NEXT: testq %rdi, %rdi +; SSE-X64-NEXT: cmovnsq %rdi, %rcx +; SSE-X64-NEXT: cvtsi2sd %rcx, %xmm0 +; SSE-X64-NEXT: jns .LBB18_2 +; SSE-X64-NEXT: # %bb.1: +; SSE-X64-NEXT: addsd %xmm0, %xmm0 +; SSE-X64-NEXT: .LBB18_2: ; SSE-X64-NEXT: retq ; ; AVX-X86-LABEL: uitofp_i64tof64: @@ -1295,13 +1304,17 @@ ; AVX-X86-NEXT: movl %esp, %ebp ; AVX-X86-NEXT: .cfi_def_cfa_register %ebp ; AVX-X86-NEXT: andl $-8, %esp -; AVX-X86-NEXT: subl $8, %esp +; AVX-X86-NEXT: subl $24, %esp +; AVX-X86-NEXT: movl 12(%ebp), %eax ; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-X86-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX-X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-X86-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX-X86-NEXT: vmovlpd %xmm0, (%esp) +; AVX-X86-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX-X86-NEXT: shrl $31, %eax +; AVX-X86-NEXT: fildll {{[0-9]+}}(%esp) +; AVX-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX-X86-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX-X86-NEXT: wait +; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-X86-NEXT: vmovsd %xmm0, (%esp) ; AVX-X86-NEXT: fldl (%esp) ; AVX-X86-NEXT: wait ; AVX-X86-NEXT: movl %ebp, %esp @@ -1311,11 +1324,18 @@ ; ; AVX1-X64-LABEL: uitofp_i64tof64: ; AVX1-X64: # %bb.0: -; AVX1-X64-NEXT: vmovq %rdi, %xmm0 -; AVX1-X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-X64-NEXT: vsubpd 
{{.*}}(%rip), %xmm0, %xmm0 -; AVX1-X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-X64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-X64-NEXT: movq %rdi, %rax +; AVX1-X64-NEXT: shrq %rax +; AVX1-X64-NEXT: movl %edi, %ecx +; AVX1-X64-NEXT: andl $1, %ecx +; AVX1-X64-NEXT: orq %rax, %rcx +; AVX1-X64-NEXT: testq %rdi, %rdi +; AVX1-X64-NEXT: cmovnsq %rdi, %rcx +; AVX1-X64-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 +; AVX1-X64-NEXT: jns .LBB18_2 +; AVX1-X64-NEXT: # %bb.1: +; AVX1-X64-NEXT: vaddsd %xmm0, %xmm0, %xmm0 +; AVX1-X64-NEXT: .LBB18_2: ; AVX1-X64-NEXT: retq ; ; AVX512-X64-LABEL: uitofp_i64tof64: diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -1262,112 +1262,218 @@ define <2 x double> @uitofp_v2i64_v2f64(<2 x i64> %x) #0 { ; SSE-32-LABEL: uitofp_v2i64_v2f64: ; SSE-32: # %bb.0: -; SSE-32-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; SSE-32-NEXT: pand %xmm0, %xmm1 -; SSE-32-NEXT: por {{\.LCPI.*}}, %xmm1 -; SSE-32-NEXT: psrlq $32, %xmm0 -; SSE-32-NEXT: por {{\.LCPI.*}}, %xmm0 -; SSE-32-NEXT: subpd {{\.LCPI.*}}, %xmm0 -; SSE-32-NEXT: addpd %xmm1, %xmm0 +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: .cfi_def_cfa_offset 8 +; SSE-32-NEXT: .cfi_offset %ebp, -8 +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $32, %esp +; SSE-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp) +; SSE-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-32-NEXT: movd %xmm1, %eax +; SSE-32-NEXT: shrl $31, %eax +; SSE-32-NEXT: fildll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; SSE-32-NEXT: fstpl {{[0-9]+}}(%esp) +; SSE-32-NEXT: wait +; SSE-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-32-NEXT: movd %xmm0, %eax +; SSE-32-NEXT: shrl 
$31, %eax +; SSE-32-NEXT: fildll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; SSE-32-NEXT: fstpl (%esp) +; SSE-32-NEXT: wait +; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: .cfi_def_cfa %esp, 4 ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: uitofp_v2i64_v2f64: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; SSE-64-NEXT: pand %xmm0, %xmm1 -; SSE-64-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-64-NEXT: psrlq $32, %xmm0 -; SSE-64-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-64-NEXT: subpd {{.*}}(%rip), %xmm0 -; SSE-64-NEXT: addpd %xmm1, %xmm0 +; SSE-64-NEXT: movdqa %xmm0, %xmm1 +; SSE-64-NEXT: movq %xmm0, %rax +; SSE-64-NEXT: movq %rax, %rcx +; SSE-64-NEXT: shrq %rcx +; SSE-64-NEXT: movl %eax, %edx +; SSE-64-NEXT: andl $1, %edx +; SSE-64-NEXT: orq %rcx, %rdx +; SSE-64-NEXT: testq %rax, %rax +; SSE-64-NEXT: cmovnsq %rax, %rdx +; SSE-64-NEXT: xorps %xmm0, %xmm0 +; SSE-64-NEXT: cvtsi2sd %rdx, %xmm0 +; SSE-64-NEXT: jns .LBB21_2 +; SSE-64-NEXT: # %bb.1: +; SSE-64-NEXT: addsd %xmm0, %xmm0 +; SSE-64-NEXT: .LBB21_2: +; SSE-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-64-NEXT: movq %xmm1, %rax +; SSE-64-NEXT: movq %rax, %rcx +; SSE-64-NEXT: shrq %rcx +; SSE-64-NEXT: movl %eax, %edx +; SSE-64-NEXT: andl $1, %edx +; SSE-64-NEXT: orq %rcx, %rdx +; SSE-64-NEXT: testq %rax, %rax +; SSE-64-NEXT: cmovnsq %rax, %rdx +; SSE-64-NEXT: xorps %xmm1, %xmm1 +; SSE-64-NEXT: cvtsi2sd %rdx, %xmm1 +; SSE-64-NEXT: jns .LBB21_4 +; SSE-64-NEXT: # %bb.3: +; SSE-64-NEXT: addsd %xmm1, %xmm1 +; SSE-64-NEXT: .LBB21_4: +; SSE-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; ; SSE41-32-LABEL: uitofp_v2i64_v2f64: ; SSE41-32: # %bb.0: -; SSE41-32-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; SSE41-32-NEXT: pand %xmm0, %xmm1 -; SSE41-32-NEXT: por {{\.LCPI.*}}, %xmm1 -; SSE41-32-NEXT: psrlq $32, %xmm0 
-; SSE41-32-NEXT: por {{\.LCPI.*}}, %xmm0 -; SSE41-32-NEXT: subpd {{\.LCPI.*}}, %xmm0 -; SSE41-32-NEXT: addpd %xmm1, %xmm0 +; SSE41-32-NEXT: pushl %ebp +; SSE41-32-NEXT: .cfi_def_cfa_offset 8 +; SSE41-32-NEXT: .cfi_offset %ebp, -8 +; SSE41-32-NEXT: movl %esp, %ebp +; SSE41-32-NEXT: .cfi_def_cfa_register %ebp +; SSE41-32-NEXT: andl $-8, %esp +; SSE41-32-NEXT: subl $32, %esp +; SSE41-32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) +; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-32-NEXT: movq %xmm1, {{[0-9]+}}(%esp) +; SSE41-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-32-NEXT: movd %xmm1, %eax +; SSE41-32-NEXT: shrl $31, %eax +; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp) +; SSE41-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; SSE41-32-NEXT: fstpl {{[0-9]+}}(%esp) +; SSE41-32-NEXT: wait +; SSE41-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-32-NEXT: movd %xmm0, %eax +; SSE41-32-NEXT: shrl $31, %eax +; SSE41-32-NEXT: fildll {{[0-9]+}}(%esp) +; SSE41-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; SSE41-32-NEXT: fstpl (%esp) +; SSE41-32-NEXT: wait +; SSE41-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-32-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; SSE41-32-NEXT: movl %ebp, %esp +; SSE41-32-NEXT: popl %ebp +; SSE41-32-NEXT: .cfi_def_cfa %esp, 4 ; SSE41-32-NEXT: retl ; ; SSE41-64-LABEL: uitofp_v2i64_v2f64: ; SSE41-64: # %bb.0: -; SSE41-64-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; SSE41-64-NEXT: pand %xmm0, %xmm1 -; SSE41-64-NEXT: por {{.*}}(%rip), %xmm1 -; SSE41-64-NEXT: psrlq $32, %xmm0 -; SSE41-64-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-64-NEXT: subpd {{.*}}(%rip), %xmm0 -; SSE41-64-NEXT: addpd %xmm1, %xmm0 +; SSE41-64-NEXT: movdqa %xmm0, %xmm1 +; SSE41-64-NEXT: movq %xmm0, %rax +; SSE41-64-NEXT: movq %rax, %rcx +; SSE41-64-NEXT: shrq %rcx +; SSE41-64-NEXT: movl %eax, %edx +; SSE41-64-NEXT: andl $1, %edx +; SSE41-64-NEXT: orq %rcx, %rdx +; SSE41-64-NEXT: testq %rax, %rax +; SSE41-64-NEXT: cmovnsq %rax, %rdx +; SSE41-64-NEXT: xorps %xmm0, 
%xmm0 +; SSE41-64-NEXT: cvtsi2sd %rdx, %xmm0 +; SSE41-64-NEXT: jns .LBB21_2 +; SSE41-64-NEXT: # %bb.1: +; SSE41-64-NEXT: addsd %xmm0, %xmm0 +; SSE41-64-NEXT: .LBB21_2: +; SSE41-64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE41-64-NEXT: movq %xmm1, %rax +; SSE41-64-NEXT: movq %rax, %rcx +; SSE41-64-NEXT: shrq %rcx +; SSE41-64-NEXT: movl %eax, %edx +; SSE41-64-NEXT: andl $1, %edx +; SSE41-64-NEXT: orq %rcx, %rdx +; SSE41-64-NEXT: testq %rax, %rax +; SSE41-64-NEXT: cmovnsq %rax, %rdx +; SSE41-64-NEXT: xorps %xmm1, %xmm1 +; SSE41-64-NEXT: cvtsi2sd %rdx, %xmm1 +; SSE41-64-NEXT: jns .LBB21_4 +; SSE41-64-NEXT: # %bb.3: +; SSE41-64-NEXT: addsd %xmm1, %xmm1 +; SSE41-64-NEXT: .LBB21_4: +; SSE41-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-64-NEXT: retq ; -; AVX1-32-LABEL: uitofp_v2i64_v2f64: -; AVX1-32: # %bb.0: -; AVX1-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-32-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-32-NEXT: vpor {{\.LCPI.*}}, %xmm1, %xmm1 -; AVX1-32-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX1-32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX1-32-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX1-32-NEXT: retl +; AVX-32-LABEL: uitofp_v2i64_v2f64: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $32, %esp +; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $1, %xmm0, %eax +; AVX-32-NEXT: shrl $31, %eax +; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX-32-NEXT: wait +; AVX-32-NEXT: vextractps $3, %xmm0, %eax +; AVX-32-NEXT: shrl $31, %eax +; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) +; 
AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX-32-NEXT: fstpl (%esp) +; AVX-32-NEXT: wait +; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl ; ; AVX1-64-LABEL: uitofp_v2i64_v2f64: ; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-64-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-64-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-64-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-64-NEXT: movq %rax, %rcx +; AVX1-64-NEXT: shrq %rcx +; AVX1-64-NEXT: movl %eax, %edx +; AVX1-64-NEXT: andl $1, %edx +; AVX1-64-NEXT: orq %rcx, %rdx +; AVX1-64-NEXT: testq %rax, %rax +; AVX1-64-NEXT: cmovnsq %rax, %rdx +; AVX1-64-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1 +; AVX1-64-NEXT: jns .LBB21_2 +; AVX1-64-NEXT: # %bb.1: +; AVX1-64-NEXT: vaddsd %xmm1, %xmm1, %xmm1 +; AVX1-64-NEXT: .LBB21_2: +; AVX1-64-NEXT: vmovq %xmm0, %rax +; AVX1-64-NEXT: movq %rax, %rcx +; AVX1-64-NEXT: shrq %rcx +; AVX1-64-NEXT: movl %eax, %edx +; AVX1-64-NEXT: andl $1, %edx +; AVX1-64-NEXT: orq %rcx, %rdx +; AVX1-64-NEXT: testq %rax, %rax +; AVX1-64-NEXT: cmovnsq %rax, %rdx +; AVX1-64-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm0 +; AVX1-64-NEXT: jns .LBB21_4 +; AVX1-64-NEXT: # %bb.3: +; AVX1-64-NEXT: vaddsd %xmm0, %xmm0, %xmm0 +; AVX1-64-NEXT: .LBB21_4: +; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-64-NEXT: retq ; -; AVX512F-32-LABEL: uitofp_v2i64_v2f64: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-32-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %xmm1, %xmm1 -; AVX512F-32-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-32-NEXT: 
vpor {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX512F-32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX512F-32-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX512F-32-NEXT: retl -; ; AVX512F-64-LABEL: uitofp_v2i64_v2f64: ; AVX512F-64: # %bb.0: -; AVX512F-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-64-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-64-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512F-64-NEXT: vmovq %xmm0, %rax +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0 +; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-64-NEXT: retq ; -; AVX512VL-32-LABEL: uitofp_v2i64_v2f64: -; AVX512VL-32: # %bb.0: -; AVX512VL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX512VL-32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX512VL-32-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX512VL-32-NEXT: retl -; ; AVX512VL-64-LABEL: uitofp_v2i64_v2f64: ; AVX512VL-64: # %bb.0: -; AVX512VL-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-64-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-64-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX512VL-64-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512VL-64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-64-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512VL-64-NEXT: vmovq %xmm0, %rax +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0 +; AVX512VL-64-NEXT: 
vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-64-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_v2i64_v2f64: diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -748,106 +748,154 @@ } define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 { -; AVX1-32-LABEL: uitofp_v4i64_v4f64: -; AVX1-32: # %bb.0: -; AVX1-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-32-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-32-NEXT: vorps {{\.LCPI.*}}, %ymm1, %ymm1 -; AVX1-32-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-32-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-32-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-32-NEXT: vorpd {{\.LCPI.*}}, %ymm0, %ymm0 -; AVX1-32-NEXT: vsubpd {{\.LCPI.*}}, %ymm0, %ymm0 -; AVX1-32-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX1-32-NEXT: retl +; AVX-32-LABEL: uitofp_v4i64_v4f64: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $64, %esp +; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $1, %xmm0, %eax +; AVX-32-NEXT: shrl $31, %eax +; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX-32-NEXT: fstpl (%esp) +; AVX-32-NEXT: wait +; AVX-32-NEXT: vextractps $3, %xmm0, %eax +; AVX-32-NEXT: shrl $31, %eax +; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) +; 
AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX-32-NEXT: wait +; AVX-32-NEXT: vextractps $1, %xmm1, %eax +; AVX-32-NEXT: shrl $31, %eax +; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX-32-NEXT: wait +; AVX-32-NEXT: vextractps $3, %xmm1, %eax +; AVX-32-NEXT: shrl $31, %eax +; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX-32-NEXT: wait +; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl ; ; AVX1-64-LABEL: uitofp_v4i64_v4f64: ; AVX1-64: # %bb.0: ; AVX1-64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-64-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-64-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-64-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-64-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; AVX1-64-NEXT: vmovq %xmm2, %rax +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; AVX1-64-NEXT: vmovq %xmm1, %rax +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1 +; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-64-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-64-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, 
%xmm3 +; AVX1-64-NEXT: vmovq %xmm2, %rax +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX1-64-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-64-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-64-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; AVX1-64-NEXT: vmovq %xmm0, %rax +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-64-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-64-NEXT: retq ; -; AVX2-32-LABEL: uitofp_v4i64_v4f64: -; AVX2-32: # %bb.0: -; AVX2-32-NEXT: vpsrlq $32, %ymm0, %ymm1 -; AVX2-32-NEXT: vpor {{\.LCPI.*}}, %ymm1, %ymm1 -; AVX2-32-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX2-32-NEXT: vsubpd %ymm2, %ymm1, %ymm1 -; AVX2-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX2-32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX2-32-NEXT: vpor {{\.LCPI.*}}, %ymm0, %ymm0 -; AVX2-32-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX2-32-NEXT: retl -; ; AVX2-64-LABEL: uitofp_v4i64_v4f64: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-64-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX2-64-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX2-64-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; AVX2-64-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-64-NEXT: 
vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX2-64-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpsrlq $32, %ymm0, %ymm1 +; AVX2-64-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-64-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; AVX2-64-NEXT: vmovq %xmm2, %rax +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; AVX2-64-NEXT: vmovq %xmm1, %rax +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1 +; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9] +; AVX2-64-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX2-64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-64-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; AVX2-64-NEXT: vmovq %xmm2, %rax +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; AVX2-64-NEXT: vmovq %xmm0, %rax +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX2-64-NEXT: retq ; -; AVX512F-32-LABEL: uitofp_v4i64_v4f64: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpsrlq $32, %ymm0, %ymm1 -; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %ymm1, %ymm1 -; AVX512F-32-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; 
AVX512F-32-NEXT: vsubpd %ymm2, %ymm1, %ymm1 -; AVX512F-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX512F-32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX512F-32-NEXT: vpor {{\.LCPI.*}}, %ymm0, %ymm0 -; AVX512F-32-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX512F-32-NEXT: retl -; ; AVX512F-64-LABEL: uitofp_v4i64_v4f64: ; AVX512F-64: # %bb.0: -; AVX512F-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-64-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX512F-64-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-64-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; AVX512F-64-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX512F-64-NEXT: vsubpd %ymm2, %ymm0, %ymm0 -; AVX512F-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-64-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512F-64-NEXT: vmovq %xmm1, %rax +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512F-64-NEXT: vmovq %xmm0, %rax +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-64-NEXT: retq ; -; AVX512VL-32-LABEL: uitofp_v4i64_v4f64: -; AVX512VL-32: # %bb.0: -; AVX512VL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %ymm1, %ymm1 -; AVX512VL-32-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512VL-32-NEXT: vpor {{\.LCPI.*}}, %ymm0, %ymm0 -; AVX512VL-32-NEXT: vsubpd {{\.LCPI.*}}{1to4}, %ymm0, %ymm0 -; AVX512VL-32-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX512VL-32-NEXT: retl -; ; AVX512VL-64-LABEL: uitofp_v4i64_v4f64: ; AVX512VL-64: # %bb.0: -; AVX512VL-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-64-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX512VL-64-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 -; AVX512VL-64-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512VL-64-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-64-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-64-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512VL-64-NEXT: vmovq %xmm1, %rax +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512VL-64-NEXT: vmovq %xmm0, %rax +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-64-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_v4i64_v4f64: diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll @@ -362,22 +362,120 @@ define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 { ; NODQ-32-LABEL: uitofp_v8i64_v8f64: ; NODQ-32: # %bb.0: -; NODQ-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200] -; NODQ-32-NEXT: vpternlogq $248, {{\.LCPI.*}}, %zmm0, %zmm1 -; NODQ-32-NEXT: vpsrlq $32, %zmm0, %zmm0 -; NODQ-32-NEXT: vporq {{\.LCPI.*}}, %zmm0, %zmm0 -; NODQ-32-NEXT: vsubpd {{\.LCPI.*}}{1to8}, %zmm0, %zmm0 -; NODQ-32-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; NODQ-32-NEXT: pushl %ebp +; NODQ-32-NEXT: .cfi_def_cfa_offset 8 +; NODQ-32-NEXT: .cfi_offset %ebp, -8 +; NODQ-32-NEXT: movl %esp, %ebp +; NODQ-32-NEXT: .cfi_def_cfa_register %ebp +; NODQ-32-NEXT: andl $-8, %esp +; NODQ-32-NEXT: subl $128, %esp +; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextractps $1, %xmm2, %eax +; NODQ-32-NEXT: shrl $31, %eax +; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) +; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) +; NODQ-32-NEXT: wait +; NODQ-32-NEXT: vextractps $3, %xmm2, %eax +; NODQ-32-NEXT: shrl $31, %eax +; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) +; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) +; NODQ-32-NEXT: wait +; NODQ-32-NEXT: vextractps $1, %xmm3, %eax +; NODQ-32-NEXT: shrl $31, %eax +; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) +; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; NODQ-32-NEXT: 
fstpl {{[0-9]+}}(%esp) +; NODQ-32-NEXT: wait +; NODQ-32-NEXT: vextractps $3, %xmm3, %eax +; NODQ-32-NEXT: shrl $31, %eax +; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) +; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) +; NODQ-32-NEXT: wait +; NODQ-32-NEXT: vextractps $1, %xmm0, %eax +; NODQ-32-NEXT: shrl $31, %eax +; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) +; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; NODQ-32-NEXT: fstpl (%esp) +; NODQ-32-NEXT: wait +; NODQ-32-NEXT: vextractps $3, %xmm0, %eax +; NODQ-32-NEXT: shrl $31, %eax +; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) +; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) +; NODQ-32-NEXT: wait +; NODQ-32-NEXT: vextractps $1, %xmm1, %eax +; NODQ-32-NEXT: shrl $31, %eax +; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) +; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) +; NODQ-32-NEXT: wait +; NODQ-32-NEXT: vextractps $3, %xmm1, %eax +; NODQ-32-NEXT: shrl $31, %eax +; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) +; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) +; NODQ-32-NEXT: wait +; NODQ-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; NODQ-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; NODQ-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; NODQ-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; NODQ-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; NODQ-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; NODQ-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; NODQ-32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; NODQ-32-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NODQ-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; NODQ-32-NEXT: movl %ebp, %esp +; NODQ-32-NEXT: popl %ebp +; NODQ-32-NEXT: .cfi_def_cfa %esp, 4 ; NODQ-32-NEXT: retl ; ; NODQ-64-LABEL: uitofp_v8i64_v8f64: ; NODQ-64: # %bb.0: -; NODQ-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = 
[4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; NODQ-64-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm0, %zmm1 -; NODQ-64-NEXT: vpsrlq $32, %zmm0, %zmm0 -; NODQ-64-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; NODQ-64-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; NODQ-64-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NODQ-64-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; NODQ-64-NEXT: vmovq %xmm1, %rax +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm3 +; NODQ-64-NEXT: vmovq %xmm2, %rax +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm2 +; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vmovq %xmm2, %rax +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vmovq %xmm0, %rax +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm0 +; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; NODQ-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; NODQ-64-NEXT: retq ; ; DQ-LABEL: uitofp_v8i64_v8f64: diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -6761,21 +6761,34 @@ define <1 x 
double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rdi, %xmm1 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1 -; CHECK-NEXT: movapd %xmm1, %xmm0 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; CHECK-NEXT: addpd %xmm1, %xmm0 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: cmovnsq %rdi, %rcx +; CHECK-NEXT: cvtsi2sd %rcx, %xmm0 +; CHECK-NEXT: jns .LBB169_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addsd %xmm0, %xmm0 +; CHECK-NEXT: .LBB169_2: # %entry ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovq %rdi, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: orq %rax, %rcx +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: cmovnsq %rdi, %rcx +; AVX1-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 +; AVX1-NEXT: jns .LBB169_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: .LBB169_2: # %entry ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i64: @@ -6906,35 +6919,77 @@ define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: por {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psrlq $32, %xmm0 -; CHECK-NEXT: por {{.*}}(%rip), %xmm0 -; CHECK-NEXT: subpd {{.*}}(%rip), 
%xmm0 -; CHECK-NEXT: addpd %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: cmovnsq %rax, %rdx +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rdx, %xmm0 +; CHECK-NEXT: jns .LBB173_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addsd %xmm0, %xmm0 +; CHECK-NEXT: .LBB173_2: # %entry +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: cmovnsq %rax, %rdx +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rdx, %xmm1 +; CHECK-NEXT: jns .LBB173_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: addsd %xmm1, %xmm1 +; CHECK-NEXT: .LBB173_4: # %entry +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1 +; AVX1-NEXT: jns .LBB173_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: vaddsd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .LBB173_2: # %entry +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; 
AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm0 +; AVX1-NEXT: jns .LBB173_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: .LBB173_4: # %entry +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: retq ; ; AVX512F-LABEL: constrained_vector_uitofp_v2f64_v2i64: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: constrained_vector_uitofp_v2f64_v2i64: @@ -7124,51 +7179,91 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rdi, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] -; CHECK-NEXT: subpd %xmm3, %xmm1 -; CHECK-NEXT: movapd %xmm1, %xmm0 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; CHECK-NEXT: addpd %xmm1, %xmm0 -; CHECK-NEXT: movq %rsi, %xmm4 -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-NEXT: subpd %xmm3, %xmm4 -; CHECK-NEXT: movapd %xmm4, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; CHECK-NEXT: addpd %xmm4, 
%xmm1 -; CHECK-NEXT: movq %rdx, %xmm4 -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-NEXT: subpd %xmm3, %xmm4 -; CHECK-NEXT: movapd %xmm4, %xmm2 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; CHECK-NEXT: addpd %xmm4, %xmm2 -; CHECK-NEXT: movlpd %xmm2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: cmovnsq %rdi, %rcx +; CHECK-NEXT: cvtsi2sd %rcx, %xmm0 +; CHECK-NEXT: jns .LBB177_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addsd %xmm0, %xmm0 +; CHECK-NEXT: .LBB177_2: # %entry +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovnsq %rsi, %rcx +; CHECK-NEXT: cvtsi2sd %rcx, %xmm1 +; CHECK-NEXT: jns .LBB177_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: addsd %xmm1, %xmm1 +; CHECK-NEXT: .LBB177_4: # %entry +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovnsq %rdx, %rcx +; CHECK-NEXT: cvtsi2sd %rcx, %xmm2 +; CHECK-NEXT: jns .LBB177_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: addsd %xmm2, %xmm2 +; CHECK-NEXT: .LBB177_6: # %entry +; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) ; CHECK-NEXT: wait ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] -; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: 
vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-NEXT: vsubpd %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0] -; AVX1-NEXT: vaddpd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1 +; AVX1-NEXT: jns .LBB177_2 +; AVX1-NEXT: # %bb.1: +; AVX1-NEXT: vaddsd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .LBB177_2: # %entry +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm2 +; AVX1-NEXT: jns .LBB177_4 +; AVX1-NEXT: # %bb.3: +; AVX1-NEXT: vaddsd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: .LBB177_4: # %entry +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vsubpd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: orq %rcx, %rdx +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: cmovnsq %rax, %rdx +; AVX1-NEXT: vcvtsi2sd %rdx, %xmm3, %xmm0 +; AVX1-NEXT: jns .LBB177_6 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: .LBB177_6: # %entry +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: 
constrained_vector_uitofp_v3f64_v3i64: @@ -7381,51 +7476,117 @@ define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm1, %xmm3 -; CHECK-NEXT: pand %xmm2, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] -; CHECK-NEXT: por %xmm4, %xmm3 -; CHECK-NEXT: psrlq $32, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] -; CHECK-NEXT: por %xmm5, %xmm1 -; CHECK-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] -; CHECK-NEXT: subpd %xmm6, %xmm1 -; CHECK-NEXT: addpd %xmm3, %xmm1 -; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: por %xmm4, %xmm2 -; CHECK-NEXT: psrlq $32, %xmm0 -; CHECK-NEXT: por %xmm5, %xmm0 -; CHECK-NEXT: subpd %xmm6, %xmm0 -; CHECK-NEXT: addpd %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: cmovnsq %rax, %rdx +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rdx, %xmm0 +; CHECK-NEXT: jns .LBB181_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: addsd %xmm0, %xmm0 +; CHECK-NEXT: .LBB181_2: # %entry +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; CHECK-NEXT: movq %xmm2, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: cmovnsq %rax, %rdx +; CHECK-NEXT: cvtsi2sd %rdx, %xmm3 +; CHECK-NEXT: jns .LBB181_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: addsd %xmm3, %xmm3 +; CHECK-NEXT: .LBB181_4: # %entry +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: movl %eax, 
%edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: cmovnsq %rax, %rdx +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2sd %rdx, %xmm2 +; CHECK-NEXT: jns .LBB181_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: addsd %xmm2, %xmm2 +; CHECK-NEXT: .LBB181_6: # %entry +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq %rcx +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: cmovnsq %rax, %rdx +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rdx, %xmm1 +; CHECK-NEXT: jns .LBB181_8 +; CHECK-NEXT: # %bb.7: +; CHECK-NEXT: addsd %xmm1, %xmm1 +; CHECK-NEXT: .LBB181_8: # %entry +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: movapd %xmm2, %xmm1 ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; AVX1-NEXT: vmovq %xmm2, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: vcvtsi2sd %rax, 
%xmm4, %xmm3 +; AVX1-NEXT: vmovq %xmm2, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512F-LABEL: constrained_vector_uitofp_v4f64_v4i64: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: 
vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: constrained_vector_uitofp_v4f64_v4i64: