Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -3860,6 +3860,18 @@
           [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_vfmaddsd3_round :
+          GCCBuiltin<"__builtin_ia32_vfmaddsd3_round">,
+          Intrinsic<[llvm_v2f64_ty],
+          [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
+           llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_vfmaddss3_round :
+          GCCBuiltin<"__builtin_ia32_vfmaddss3_round">,
+          Intrinsic<[llvm_v4f32_ty],
+          [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+           llvm_i32_ty], [IntrNoMem]>;
+
   def int_x86_avx512_mask3_vfmsub_pd_128 :
           GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask3">,
           Intrinsic<[llvm_v2f64_ty],
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -16819,6 +16819,7 @@
                                               Src2, Src1, Src3),
                                   Mask, PassThru, Subtarget, DAG);
     }
+    case FMA_OP_MASK_SCA_RM:
     case FMA_OP_MASK3:
     case FMA_OP_MASKZ:
     case FMA_OP_MASK: {
@@ -16841,6 +16842,11 @@
       // First, we check if the intrinsic may have non-default rounding mode,
       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrData->Type == FMA_OP_MASK_SCA_RM)
+        return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
+                                                Op.getValueType(), Src1, Src2,
+                                                Src3, Op.getOperand(5)),
+                                    Mask, PassThru, Subtarget, DAG);
       if (IntrWithRoundingModeOpcode != 0) {
         SDValue Rnd = Op.getOperand(5);
         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -4645,7 +4645,7 @@
                         string SUFF> {
   defm NAME#213#SUFF: avx512_fma3s_common;
   defm NAME#231#SUFF: avx512_fma3s_common;
   defm NAME#132#SUFF: avx512_fma3s_common

 %data)
   ret void
 }
+
+declare <2 x double> @llvm.x86.avx512.vfmaddsd3.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_vfmaddsd3_round(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_vfmaddsd3_round:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm0, %zmm3
+; CHECK-NEXT:    vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm4
+; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %zmm0, %zmm5
+; CHECK-NEXT:    vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
+; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.avx512.vfmaddsd3.round(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+  %res1 = call <2 x double> @llvm.x86.avx512.vfmaddsd3.round(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+  %res2 = call <2 x double> @llvm.x86.avx512.vfmaddsd3.round(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+  %res3 = call <2 x double> @llvm.x86.avx512.vfmaddsd3.round(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+  %res4 = fadd <2 x double> %res, %res1
+  %res5 = fadd <2 x double> %res2, %res3
+  %res6 = fadd <2 x double> %res4, %res5
+  ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.vfmaddss3.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_vfmaddss3_round(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_vfmaddss3_round:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovaps %zmm0, %zmm3
+; CHECK-NEXT:    vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm4
+; CHECK-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %zmm0, %zmm5
+; CHECK-NEXT:    vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
+; CHECK-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.avx512.vfmaddss3.round(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.vfmaddss3.round(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+  %res2 = call <4 x float> @llvm.x86.avx512.vfmaddss3.round(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+  %res3 = call <4 x float> @llvm.x86.avx512.vfmaddss3.round(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+  %res4 = fadd <4 x float> %res, %res1
+  %res5 = fadd <4 x float> %res2, %res3
+  %res6 = fadd <4 x float> %res4, %res5
+  ret <4 x float> %res6
+}
+
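For reference only (not part of the patch), a minimal C-level usage sketch of the two new builtins. It assumes the matching clang prototypes for __builtin_ia32_vfmaddsd3_round and __builtin_ia32_vfmaddss3_round are declared elsewhere and mirror the intrinsic signatures added above (three vector operands, an i8 mask, and an i32 rounding-mode operand); the wrapper names are hypothetical. A rounding operand of 4 means "use the current MXCSR rounding mode" (no embedded rounding), while 3 selects the {rz-sae} embedded rounding checked in the tests above.

    /* Hedged sketch: the builtin prototypes are assumed, not added by this
       patch; the wrapper names below are hypothetical. */
    #include <immintrin.h>

    /* Masked scalar FMA, double: when bit 0 of k is set, the low element is
       a[0]*b[0]+c[0]; otherwise the low element of a is passed through (see
       the masked vfmadd132sd in the CHECK lines). Rounding operand 4 = use
       the current MXCSR rounding mode, as in the "i32 4" test calls. */
    static inline __m128d fmadd_sd_mask(__m128d a, __m128d b, __m128d c,
                                        unsigned char k) {
      return __builtin_ia32_vfmaddsd3_round(a, b, c, k, 4);
    }

    /* Masked scalar FMA, float, with embedded round-toward-zero: rounding
       operand 3 corresponds to the {rz-sae} encoding in the CHECK lines. */
    static inline __m128 fmadd_ss_mask_rz(__m128 a, __m128 b, __m128 c,
                                          unsigned char k) {
      return __builtin_ia32_vfmaddss3_round(a, b, c, k, 3);
    }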