diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56680,9 +56680,6 @@
   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
     return SDValue();
 
-  if (Subtarget.hasFP16())
-    return SDValue();
-
   bool IsStrict = N->isStrictFPOpcode();
   EVT VT = N->getValueType(0);
   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
@@ -56692,11 +56689,47 @@
       SrcVT.getVectorElementType() != MVT::f32)
     return SDValue();
 
+  SDLoc dl(N);
+
+  SDValue Cvt, Chain;
   unsigned NumElts = VT.getVectorNumElements();
-  if (NumElts == 1 || !isPowerOf2_32(NumElts))
+  if (Subtarget.hasFP16()) {
+    // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
+    // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
+    if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
+      SDValue Cvt0, Cvt1;
+      SDValue Op0 = Src.getOperand(0);
+      SDValue Op1 = Src.getOperand(1);
+      bool IsOp0Strict = Op0->isStrictFPOpcode();
+      if (Op0.getOpcode() != Op1.getOpcode() ||
+          Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
+          Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
+        return SDValue();
+      }
+      int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
+      if (IsStrict) {
+        assert(IsOp0Strict && "Op0 must be strict node");
+        unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
+                           ? X86ISD::STRICT_CVTSI2P
+                           : X86ISD::STRICT_CVTUI2P;
+        Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
+                           {Op0.getOperand(0), Op0.getOperand(1)});
+        Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
+                           {Op1.getOperand(0), Op1.getOperand(1)});
+        Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
+        return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
+      }
+      unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
+                                                        : X86ISD::CVTUI2P;
+      Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
+      Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
+      return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
+    }
     return SDValue();
+  }
 
-  SDLoc dl(N);
+  if (NumElts == 1 || !isPowerOf2_32(NumElts))
+    return SDValue();
 
   // Widen to at least 4 input elements.
   if (NumElts < 4)
@@ -56704,9 +56737,8 @@
                       DAG.getConstantFP(0.0, dl, SrcVT));
 
   // Destination is v8i16 with at least 8 elements.
-  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
-                               std::max(8U, NumElts));
-  SDValue Cvt, Chain;
+  EVT CvtVT =
+      EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
   SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
   if (IsStrict) {
     Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
--- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
@@ -1031,10 +1031,9 @@
 define <8 x half> @s64tof16(<8 x i64> %a) #0 {
 ; CHECK-LABEL: s64tof16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtqq2ps %ymm0, %xmm0
-; CHECK-NEXT:    vcvtqq2ps %ymm1, %xmm1
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    vcvtps2phx %ymm0, %xmm0
+; CHECK-NEXT:    vcvtqq2ph %ymm1, %xmm1
+; CHECK-NEXT:    vcvtqq2ph %ymm0, %xmm0
+; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = sitofp <8 x i64> %a to <8 x half>
@@ -1044,10 +1043,9 @@
 define <8 x half> @u64tof16(<8 x i64> %a) #0 {
 ; CHECK-LABEL: u64tof16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtuqq2ps %ymm0, %xmm0
-; CHECK-NEXT:    vcvtuqq2ps %ymm1, %xmm1
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    vcvtps2phx %ymm0, %xmm0
+; CHECK-NEXT:    vcvtuqq2ph %ymm1, %xmm1
+; CHECK-NEXT:    vcvtuqq2ph %ymm0, %xmm0
+; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = uitofp <8 x i64> %a to <8 x half>
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll
@@ -146,10 +146,9 @@
 define <8 x half> @sitofp_v8i64_v8f16(<8 x i64> %x) #1 {
 ; CHECK-LABEL: sitofp_v8i64_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtqq2ps %ymm0, %xmm0
-; CHECK-NEXT:    vcvtqq2ps %ymm1, %xmm1
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    vcvtps2phx %ymm0, %xmm0
+; CHECK-NEXT:    vcvtqq2ph %ymm1, %xmm1
+; CHECK-NEXT:    vcvtqq2ph %ymm0, %xmm0
+; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %result = call <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i64(<8 x i64> %x,
@@ -161,10 +160,9 @@
 define <8 x half> @uitofp_v8i64_v8f16(<8 x i64> %x) #1 {
 ; CHECK-LABEL: uitofp_v8i64_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtuqq2ps %ymm0, %xmm0
-; CHECK-NEXT:    vcvtuqq2ps %ymm1, %xmm1
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    vcvtps2phx %ymm0, %xmm0
+; CHECK-NEXT:    vcvtuqq2ph %ymm1, %xmm1
+; CHECK-NEXT:    vcvtuqq2ph %ymm0, %xmm0
+; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %result = call <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i64(<8 x i64> %x,