diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -703,12 +703,14 @@
   // Conversions between float and half-float.
   CVTPS2PH,
+  CVTPS2PH_SAE,
   CVTPH2PS,
   CVTPH2PS_SAE,
 
   // Masked version of above.
   // SRC, RND, PASSTHRU, MASK
   MCVTPS2PH,
+  MCVTPS2PH_SAE,
 
   // Galois Field Arithmetic Instructions
   GF2P8AFFINEINVQB,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27152,15 +27152,26 @@
     SDValue PassThru = Op.getOperand(3);
     SDValue Mask = Op.getOperand(4);
 
+    unsigned RC = 0;
+    unsigned Opc = IntrData->Opc0;
+    bool SAE = Src.getValueType().is512BitVector() &&
+               (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
+    if (SAE) {
+      Opc = X86ISD::CVTPS2PH_SAE;
+      Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
+    }
+
     if (isAllOnesConstant(Mask))
-      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
+      return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
 
+    if (SAE)
+      Opc = X86ISD::MCVTPS2PH_SAE;
+    else
+      Opc = IntrData->Opc1;
     MVT SrcVT = Src.getSimpleValueType();
     MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
-    return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
-                       PassThru, Mask);
-
+    return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
   }
   case CVTNEPS2BF16_MASK: {
     SDValue Src = Op.getOperand(1);
@@ -33841,7 +33852,9 @@
   NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
   NODE_NAME_CASE(CVTPS2PH)
   NODE_NAME_CASE(STRICT_CVTPS2PH)
+  NODE_NAME_CASE(CVTPS2PH_SAE)
   NODE_NAME_CASE(MCVTPS2PH)
+  NODE_NAME_CASE(MCVTPS2PH_SAE)
   NODE_NAME_CASE(CVTPH2PS)
   NODE_NAME_CASE(STRICT_CVTPH2PS)
   NODE_NAME_CASE(CVTPH2PS_SAE)
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -9181,12 +9181,29 @@
 multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                                SchedWrite Sched> {
-  let hasSideEffects = 0, Uses = [MXCSR] in
-  defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
-                   (outs _dest.RC:$dst),
-                   (ins _src.RC:$src1, i32u8imm:$src2),
-                   "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", []>,
-                   EVEX_B, AVX512AIi8Base, Sched<[Sched]>;
+  let hasSideEffects = 0, Uses = [MXCSR] in {
+    def rrb : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+                  (ins _src.RC:$src1, i32u8imm:$src2),
+                  "vcvtps2ph\t{$src2, {sae}, $src1, $dst|$dst, $src1, {sae}, $src2}",
+                  [(set _dest.RC:$dst,
+                        (X86cvtps2phSAE (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>,
+                  EVEX_B, Sched<[Sched]>;
+    let Constraints = "$src0 = $dst" in
+    def rrbk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+                  (ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+                  "vcvtps2ph\t{$src2, {sae}, $src1, $dst {${mask}}|$dst {${mask}}, $src1, {sae}, $src2}",
+                  [(set _dest.RC:$dst,
+                        (X86mcvtps2phSAE (_src.VT _src.RC:$src1), (i32 timm:$src2),
+                                         _dest.RC:$src0, _src.KRCWM:$mask))]>,
+                  EVEX_B, Sched<[Sched]>, EVEX_K;
+    def rrbkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+                  (ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+                  "vcvtps2ph\t{$src2, {sae}, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, {sae}, $src2}",
+                  [(set _dest.RC:$dst,
+                        (X86mcvtps2phSAE (_src.VT _src.RC:$src1), (i32 timm:$src2),
+                                         _dest.ImmAllZerosV, _src.KRCWM:$mask))]>,
+                  EVEX_B, Sched<[Sched]>, EVEX_KZ;
+  }
 }
 
 let Predicates = [HasAVX512] in {
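Note on the lowering change above: the intrinsic's i32 rounding operand packs an
exception-suppression bit on top of the rounding control, which is why the updated
512-bit tests below turn "i32 11" into "vcvtps2ph $3, {sae}, ...". The standalone
C++ sketch below condenses that decoding; the helper name decodesAsSAE is
hypothetical (the real checks are isRoundModeSAEToX and isRoundModeSAE in
X86ISelLowering.cpp), and the bit layout assumed is the usual X86::STATIC_ROUNDING
encoding.

// Sketch only: mirrors the SAE decoding performed by the patched lowering.
#include <cstdio>

namespace {
enum : unsigned {
  TO_NEAREST_INT = 0, // bits 2:0 hold the rounding control
  TO_NEG_INF     = 1,
  TO_POS_INF     = 2,
  TO_ZERO        = 3,
  CUR_DIRECTION  = 4, // use the current MXCSR rounding mode
  NO_EXC         = 8  // bit 3 requests {sae}
};

// Returns true when Imm requests {sae}; RC receives the rounding-control
// value that becomes the instruction's immediate.
bool decodesAsSAE(unsigned Imm, unsigned &RC) {
  if (!(Imm & NO_EXC))
    return false;    // no {sae}: the immediate passes through unchanged
  RC = Imm ^ NO_EXC; // strip the exception-suppression bit
  return RC <= TO_ZERO || RC == CUR_DIRECTION;
}
} // namespace

int main() {
  for (unsigned Imm : {2u, 11u, 12u}) {
    unsigned RC = 0;
    if (decodesAsSAE(Imm, RC))
      std::printf("imm %2u -> vcvtps2ph $%u, {sae}, ...\n", Imm, RC);
    else
      std::printf("imm %2u -> vcvtps2ph $%u, ...\n", Imm, Imm);
  }
  // imm  2 -> vcvtps2ph $2, ...         (plain rounding control)
  // imm 11 -> vcvtps2ph $3, {sae}, ...  (NO_EXC | TO_ZERO)
  // imm 12 -> vcvtps2ph $4, {sae}, ...  (NO_EXC | CUR_DIRECTION)
  return 0;
}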
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -762,13 +762,17 @@
                                [(X86strict_cvtps2ph node:$src1, node:$src2),
                                 (X86cvtps2ph node:$src1, node:$src2)]>;
 
-def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH",
-                          SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>,
-                                               SDTCVecEltisVT<1, f32>,
-                                               SDTCisVT<2, i32>,
-                                               SDTCisSameAs<0, 3>,
-                                               SDTCVecEltisVT<4, i1>,
-                                               SDTCisSameNumEltsAs<1, 4>]> >;
+def X86cvtps2phSAE : SDNode<"X86ISD::CVTPS2PH_SAE", SDTcvtps2ph>;
+
+def SDTmcvtps2ph : SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>,
+                                        SDTCVecEltisVT<1, f32>,
+                                        SDTCisVT<2, i32>,
+                                        SDTCisSameAs<0, 3>,
+                                        SDTCVecEltisVT<4, i1>,
+                                        SDTCisSameNumEltsAs<1, 4>]>;
+def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH", SDTmcvtps2ph>;
+def X86mcvtps2phSAE : SDNode<"X86ISD::MCVTPS2PH_SAE", SDTmcvtps2ph>;
+
 def X86vfpextSAE  : SDNode<"X86ISD::VFPEXT_SAE",
                        SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>,
                                             SDTCisFP<1>, SDTCisVec<1>,
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -1011,8 +1011,8 @@
 ; X64-LABEL: test_x86_vcvtps2ph_256:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
-; X64-NEXT:    vcvtps2ph $2, %zmm0, %ymm1 {%k1}
+; X64-NEXT:    vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
+; X64-NEXT:    vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
 ; X64-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X64-NEXT:    vcvtps2ph $2, %zmm0, (%rsi)
 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
@@ -1022,15 +1022,15 @@
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
-; X86-NEXT:    vcvtps2ph $2, %zmm0, %ymm1 {%k1}
+; X86-NEXT:    vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
+; X86-NEXT:    vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
 ; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vcvtps2ph $2, %zmm0, (%eax)
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 ; X86-NEXT:    retl
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
-  %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
-  %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask)
+  %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask)
+  %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 12, <16 x i16> %src, i16 %mask)
   store <16 x i16> %res1, ptr %dst
   %res = add <16 x i16> %res2, %res3
   ret <16 x i16> %res
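For reference, the immediates 11 and 12 in the 512-bit test above are what the
C-level intrinsics would produce. A minimal sketch, assuming clang's immintrin.h
macro spellings for the AVX512F cvt_roundps_ph family (compile with -mavx512f);
the function names here are illustrative only:

#include <immintrin.h>

__m256i cvt_rz_sae(__m512 a, __mmask16 k) {
  // _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC == 3 | 8 == 11: selects the new
  // X86ISD::MCVTPS2PH_SAE path, "vcvtps2ph $3, {sae}, %zmm0, ... {%k1} {z}".
  return _mm512_maskz_cvt_roundps_ph(k, a,
                                     _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

__m256i cvt_cur_sae(__m256i src, __mmask16 k, __m512 a) {
  // _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC == 4 | 8 == 12: {sae} with
  // the current MXCSR rounding mode, "vcvtps2ph $4, {sae}, %zmm0, ... {%k1}".
  return _mm512_mask_cvt_roundps_ph(src, k, a,
                                    _MM_FROUND_CUR_DIRECTION |
                                        _MM_FROUND_NO_EXC);
}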
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -4356,8 +4356,8 @@
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
-; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x02]
-; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
+; X86-NEXT:    vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
+; X86-NEXT:    vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
 ; X86-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
 ; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4366,14 +4366,14 @@
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
-; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x02]
-; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
+; X64-NEXT:    vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
+; X64-NEXT:    vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
 ; X64-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
 ; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
-  %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
-  %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask)
+  %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 11, <8 x i16> %src, i8 %mask)
   %res0 = add <8 x i16> %res1, %res2
   %res = add <8 x i16> %res3, %res0
   ret <8 x i16> %res
@@ -4387,8 +4387,8 @@
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
-; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x02]
-; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
+; X86-NEXT:    vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
+; X86-NEXT:    vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
 ; X86-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
 ; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
@@ -4398,15 +4398,15 @@
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
-; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x02]
-; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
+; X64-NEXT:    vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
+; X64-NEXT:    vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
 ; X64-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
 ; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
-  %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
-  %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 11, <8 x i16> zeroinitializer, i8 %mask)
+  %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 12, <8 x i16> %src, i8 %mask)
   %res0 = add <8 x i16> %res1, %res2
   %res = add <8 x i16> %res3, %res0
   ret <8 x i16> %res
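The VL tests above pin down the other side of the new is512BitVector() guard:
128-bit and 256-bit sources have no {sae} encoding, so the immediate reaches the
instruction unchanged (note the raw $10/$11/$12 and the matching final imm8 byte
in each encoding). A corresponding user-level sketch, again assuming clang's
macro spelling for the AVX512VL variant (compile with -mavx512vl):

#include <immintrin.h>

__m128i cvt256_passthrough_imm(__m256 a, __mmask8 k) {
  // No SAE path for a 256-bit source: this stays "vcvtps2ph $11, %ymm0, ...".
  return _mm256_maskz_cvt_roundps_ph(k, a, 11);
}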