diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29009,6 +29009,48 @@
       }
       return;
     }
+    if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
+        Subtarget.hasAVX() && !Subtarget.hasAVX512()) {
+      // TODO: Any SSE41+ subtarget should work here but BLENDV codegen ends
+      // up a lot worse than it should be.
+      SDValue Zero = DAG.getConstant(0, dl, SrcVT);
+      SDValue One = DAG.getConstant(1, dl, SrcVT);
+      SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
+                                 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
+                                 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
+      SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
+      SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
+      SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
+      for (int i = 0; i != 2; ++i) {
+        SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+                                  SignSrc, DAG.getIntPtrConstant(i, dl));
+        if (IsStrict)
+          SignCvts[i] = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl,
+                                    {MVT::f32, MVT::Other},
+                                    {N->getOperand(0), Elt});
+        else
+          SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
+      }
+      SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
+      SDValue Slow, Chain;
+      if (IsStrict) {
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                            SignCvts[0].getValue(1), SignCvts[1].getValue(1));
+        Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
+                           {Chain, SignCvt, SignCvt});
+        Chain = Slow.getValue(1);
+      } else {
+        Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
+      }
+      IsNeg = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, IsNeg);
+      IsNeg = DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
+      SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
+      Results.push_back(Cvt);
+      if (IsStrict)
+        Results.push_back(Chain);
+      return;
+    }
+
     if (SrcVT != MVT::v2i32)
       return;
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
@@ -311,33 +311,20 @@
 ;
 ; AVX1-64-LABEL: uitofp_v2i64_v2f32:
 ; AVX1-64:       # %bb.0:
+; AVX1-64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-64-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
+; AVX1-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-64-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; AVX1-64-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX1-64-NEXT:    vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX1-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-64-NEXT:    movq %rax, %rcx
-; AVX1-64-NEXT:    shrq %rcx
-; AVX1-64-NEXT:    movl %eax, %edx
-; AVX1-64-NEXT:    andl $1, %edx
-; AVX1-64-NEXT:    orq %rcx, %rdx
-; AVX1-64-NEXT:    testq %rax, %rax
-; AVX1-64-NEXT:    cmovnsq %rax, %rdx
-; AVX1-64-NEXT:    vcvtsi2ss %rdx, %xmm1, %xmm1
-; AVX1-64-NEXT:    jns .LBB3_2
-; AVX1-64-NEXT:  # %bb.1:
-; AVX1-64-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX1-64-NEXT:  .LBB3_2:
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
 ; AVX1-64-NEXT:    vmovq %xmm0, %rax
-; AVX1-64-NEXT:    movq %rax, %rcx
-; AVX1-64-NEXT:    shrq %rcx
-; AVX1-64-NEXT:    movl %eax, %edx
-; AVX1-64-NEXT:    andl $1, %edx
-; AVX1-64-NEXT:    orq %rcx, %rdx
-; AVX1-64-NEXT:    testq %rax, %rax
-; AVX1-64-NEXT:    cmovnsq %rax, %rdx
-; AVX1-64-NEXT:    vcvtsi2ss %rdx, %xmm2, %xmm0
-; AVX1-64-NEXT:    jns .LBB3_4
-; AVX1-64-NEXT:  # %bb.3:
-; AVX1-64-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX1-64-NEXT:  .LBB3_4:
-; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm0
+; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; AVX1-64-NEXT:    vaddps %xmm0, %xmm0, %xmm2
+; AVX1-64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; AVX1-64-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX1-64-NEXT:    retq
 ;
 ; AVX512F-64-LABEL: uitofp_v2i64_v2f32:
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1953,35 +1953,20 @@
 ;
 ; VEX-LABEL: uitofp_2i64_to_4f32:
 ; VEX:       # %bb.0:
+; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
+; VEX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm2
+; VEX-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; VEX-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; VEX-NEXT:    vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
 ; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB41_1
-; VEX-NEXT:  # %bb.2:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    jmp .LBB41_3
-; VEX-NEXT:  .LBB41_1:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT:  .LBB41_3:
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
 ; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB41_4
-; VEX-NEXT:  # %bb.5:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT:    retq
-; VEX-NEXT:  .LBB41_4:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; VEX-NEXT:    vaddps %xmm0, %xmm0, %xmm2
+; VEX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; VEX-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_2i64_to_4f32:
@@ -2100,35 +2085,21 @@
 ;
 ; VEX-LABEL: uitofp_2i64_to_2f32:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB42_1
-; VEX-NEXT:  # %bb.2:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    jmp .LBB42_3
-; VEX-NEXT:  .LBB42_1:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT:  .LBB42_3:
+; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
+; VEX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm2
+; VEX-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; VEX-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; VEX-NEXT:    vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
 ; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB42_4
-; VEX-NEXT:  # %bb.5:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; VEX-NEXT:    retq
-; VEX-NEXT:  .LBB42_4:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
+; VEX-NEXT:    vmovq %xmm0, %rax
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm0
+; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; VEX-NEXT:    vaddps %xmm0, %xmm0, %xmm2
+; VEX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; VEX-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; VEX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_2i64_to_2f32:
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -6939,33 +6939,20 @@
 ;
 ; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i64:
 ; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    movl %eax, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    orq %rcx, %rdx
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    cmovnsq %rax, %rdx
-; AVX1-NEXT:    vcvtsi2ss %rdx, %xmm1, %xmm1
-; AVX1-NEXT:    jns .LBB174_2
-; AVX1-NEXT:  # %bb.1:
-; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:  .LBB174_2: # %entry
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    movl %eax, %edx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    orq %rcx, %rdx
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    cmovnsq %rax, %rdx
-; AVX1-NEXT:    vcvtsi2ss %rdx, %xmm2, %xmm0
-; AVX1-NEXT:    jns .LBB174_4
-; AVX1-NEXT:  # %bb.3:
-; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:  .LBB174_4: # %entry
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
+; AVX1-NEXT:    vaddps %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i64:
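For reference, both the removed scalar branches (shrq / andl $1 / orq, cvtsi2ss, conditional addss) and the new vector lowering implement the same round-to-odd trick for uint64 -> float32: inputs that are negative when reinterpreted as signed i64 are halved with the low bit OR'd back in, converted with a signed cvtsi2ss, then doubled. A minimal scalar sketch of the idea, in plain C++ rather than SelectionDAG code (the helper name u64_to_f32 is invented for illustration):

    #include <cstdint>
    #include <cstdio>

    static float u64_to_f32(uint64_t X) {
      // High bit clear: the value fits in int64_t, so a signed convert is exact.
      if (static_cast<int64_t>(X) >= 0)
        return static_cast<float>(static_cast<int64_t>(X));
      // Halve with round-to-odd: the OR'd low bit acts as a sticky bit, so no
      // information relevant to float rounding is lost by the shift.
      uint64_t Halved = (X >> 1) | (X & 1);
      // Convert the now non-negative half as signed, then double the result.
      return static_cast<float>(static_cast<int64_t>(Halved)) * 2.0f;
    }

    int main() {
      // 2^63 + 1 is negative as int64_t, so it exercises the slow path.
      std::printf("%f\n", u64_to_f32(0x8000000000000001ULL));
    }

Because the sticky bit makes the halving round to odd, the rounding performed by the final doubling matches a direct uint64-to-float conversion; that is why the vector path only needs a blend (vblendvps on the sign mask) between the fast signed convert and the halved-then-doubled result, with no per-element branching.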