Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -371,6 +371,9 @@ // Vector multiply packed signed doubleword integers PMULDQ, + // Rounding mode node for all FP arithmetic and conversion intrinsics + ROUNDMODE, + // FMA nodes FMADD, FNMADD, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -16950,6 +16950,26 @@ Op.getOperand(2)), Op.getOperand(4), Op.getOperand(3), Subtarget, DAG); } + case INTR_TYPE_3OP_MASK_RM:{ + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + SDValue RoundingMode = Op.getOperand(5); + auto *SAE = dyn_cast(RoundingMode); + if (SAE == nullptr) + llvm_unreachable("Rounding mode must be constant value"); + if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION){ + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3), + Mask, Src1, Subtarget, DAG); + } else { + SDValue fmaNode = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3); + return getVectorMaskingNode(DAG.getNode(X86ISD::ROUNDMODE, dl, VT, + fmaNode, RoundingMode), + Mask, Src1, Subtarget, DAG); + } + } case CMP_MASK: case CMP_MASK_CC: { // Comparison intrinsics with masks. @@ -17219,30 +17239,6 @@ return DAG.getNode(Opcode, dl, VTs, NewOps); } - case Intrinsic::x86_fma_mask_vfmadd_ps_512: - case Intrinsic::x86_fma_mask_vfmadd_pd_512: - case Intrinsic::x86_fma_mask_vfmsub_ps_512: - case Intrinsic::x86_fma_mask_vfmsub_pd_512: - case Intrinsic::x86_fma_mask_vfnmadd_ps_512: - case Intrinsic::x86_fma_mask_vfnmadd_pd_512: - case Intrinsic::x86_fma_mask_vfnmsub_ps_512: - case Intrinsic::x86_fma_mask_vfnmsub_pd_512: - case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: - case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: - case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: - case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: { - auto *SAE = cast(Op.getOperand(5)); - if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION) - return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), - dl, Op.getValueType(), - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)), - Op.getOperand(4), Op.getOperand(1), - Subtarget, DAG); - else - return SDValue(); - } case Intrinsic::x86_fma_vfmadd_ps: case Intrinsic::x86_fma_vfmadd_pd: Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -3522,6 +3522,29 @@ } } // Constraints = "$src1 = $dst" +let Constraints = "$src1 = $dst" in { +// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching. +multiclass avx512_fma3_round_rrb opc, string OpcodeStr, X86VectorVTInfo _, + SDPatternOperator OpNode = null_frag> { + defm rb: AVX512_maskable_3src, + AVX512FMA3Base, EVEX_B, EVEX_RC; + } +} // Constraints = "$src1 = $dst" + +multiclass avx512_fma3_round_forms opc213, bits<8> opc231, string OpcodeStr, + X86VectorVTInfo VTI, SDPatternOperator OpNode> { + defm v213r : avx512_fma3_round_rrb, EVEX_CD8; + + defm v231r : avx512_fma3_round_rrb, EVEX_CD8; +} + multiclass avx512_fma3p_forms opc213, bits<8> opc231, string OpcodeStr, X86VectorVTInfo VTI, SDPatternOperator OpNode> { @@ -3532,12 +3555,13 @@ VTI>, EVEX_CD8; } -multiclass avx512_fma3p opc213, bits<8> opc231, - string OpcodeStr, +multiclass avx512_fma3p opc213, bits<8> opc231, string OpcodeStr, SDPatternOperator OpNode> { let ExeDomain = SSEPackedSingle in { - defm NAME##PSZ : avx512_fma3p_forms, EVEX_V512; + defm NAME##PSZ : avx512_fma3p_forms, + avx512_fma3_round_forms, EVEX_V512; defm NAME##PSZ256 : avx512_fma3p_forms, EVEX_V256; defm NAME##PSZ128 : avx512_fma3p_forms, + avx512_fma3_round_forms, EVEX_V512, VEX_W; defm NAME##PDZ256 : avx512_fma3p_forms, EVEX_V256, VEX_W; @@ -3580,7 +3606,6 @@ } } // Constraints = "$src1 = $dst" - multiclass avx512_fma3p_m132_f opc, string OpcodeStr, SDNode OpNode> { Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -203,6 +203,7 @@ def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def SDTRndMode : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisVec<0>, SDTCisInt<2>]>; def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, @@ -258,6 +259,8 @@ def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>; +def X86RndMode : SDNode<"X86ISD::ROUNDMODE", SDTRndMode>; + def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -21,8 +21,9 @@ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK, INTR_TYPE_SCALAR_MASK_RM, - COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM, BLEND + INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_3OP_MASK_RM, FMA_OP_MASK, + INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM, + BLEND }; struct IntrinsicData { @@ -400,28 +401,40 @@ X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_128, FMA_OP_MASK, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_128, FMA_OP_MASK, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0), X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FNMADD, 0), X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0), X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FNMADD, 0), X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB , 0), X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), Index: test/CodeGen/X86/avx512-fma-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-fma-intrinsics.ll +++ test/CodeGen/X86/avx512-fma-intrinsics.ll @@ -182,3 +182,283 @@ ret <8 x double> %res } +define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne + ; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn + ; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xa8,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp + ; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xa8,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz + ; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa8,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current + ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa8,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne + ; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn + ; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xa8,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp + ; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xa8,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz + ; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xa8,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current + ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rne + ; CHECK: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xaa,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtn + ; CHECK: vfmsub213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xaa,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtp + ; CHECK: vfmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xaa,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmsub512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_rtz + ; CHECK: vfmsub213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xaa,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmsub512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrb_current + ; CHECK: vfmsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xaa,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rne + ; CHECK: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xaa,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtn + ; CHECK: vfmsub213ps {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xaa,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtp + ; CHECK: vfmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xaa,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_rtz + ; CHECK: vfmsub213ps {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xaa,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_round_vfmsub512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { + ; CHECK-LABEL: test_mask_round_vfmsub512_ps_rrbz_current + ; CHECK: vfmsub213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xaa,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind + ret <16 x float> %res +} + +define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne + ; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn + ; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp + ; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz + ; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current + ; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne + ; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn + ; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp + ; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz + ; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current + ; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind + ret <8 x double> %res +} + + +define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne + ; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xae,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn + ; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xae,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp + ; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xae,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz + ; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xae,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current + ; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xae,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne + ; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xae,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn + ; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xae,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp + ; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xae,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz + ; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xae,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { + ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current + ; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind + ret <8 x double> %res +}