Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -2073,11 +2073,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmadd_ps_128 : GCCBuiltin<"__builtin_ia32_vfmaddps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmadd_pd_128 : GCCBuiltin<"__builtin_ia32_vfmaddpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_vfmsub_ss : GCCBuiltin<"__builtin_ia32_vfmsubss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -2107,11 +2127,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmsubps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmsub_ps_128 : GCCBuiltin<"__builtin_ia32_vfmsubps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmsubpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmsub_pd_128 : GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_vfnmadd_ss : GCCBuiltin<"__builtin_ia32_vfnmaddss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -2141,11 +2181,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfnmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmaddps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfnmadd_ps_128 : GCCBuiltin<"__builtin_ia32_vfnmaddps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfnmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmaddpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfnmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmaddpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfnmadd_pd_128 : GCCBuiltin<"__builtin_ia32_vfnmaddpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_vfnmsub_ss : GCCBuiltin<"__builtin_ia32_vfnmsubss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -2175,11 +2235,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfnmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmsubps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfnmsub_ps_128 : GCCBuiltin<"__builtin_ia32_vfnmsubps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfnmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmsubpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfnmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmsubpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfnmsub_pd_128 : GCCBuiltin<"__builtin_ia32_vfnmsubpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -2203,11 +2283,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmaddsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddsubps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmaddsub_ps_128 : GCCBuiltin<"__builtin_ia32_vfmaddsubps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfmaddsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmaddsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmaddsub_pd_128 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_vfmsubadd_ps : GCCBuiltin<"__builtin_ia32_vfmsubaddps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -2231,11 +2331,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmsubadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmsubaddps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmsubadd_ps_128 : GCCBuiltin<"__builtin_ia32_vfmsubaddps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfmsubadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmsubadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmsubadd_pd_128 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; } //===----------------------------------------------------------------------===// Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -16988,6 +16988,16 @@ return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, PassThru); } + case FMA_OP_MASK: + { + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)), + Op.getOperand(4), Op.getOperand(1), + Subtarget, DAG); + } default: break; } Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -3475,61 +3475,58 @@ AVX512FMA3Base; let mayLoad = 1 in - def m: AVX512FMA3; - def mb: AVX512FMA3, EVEX_B; -} + defm m: AVX512_maskable_3src, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src, + AVX512FMA3Base, EVEX_B; + } } // Constraints = "$src1 = $dst" multiclass avx512_fma3p_forms opc213, bits<8> opc231, string OpcodeStr, X86VectorVTInfo VTI, SDPatternOperator OpNode> { - defm v213 : avx512_fma3p_rm, - EVEX_V512, EVEX_CD8; - - defm v231 : avx512_fma3p_rm, - EVEX_V512, EVEX_CD8; + defm v213r : avx512_fma3p_rm, EVEX_CD8; + + defm v231r : avx512_fma3p_rm, EVEX_CD8; } +multiclass avx512_fma3p opc213, bits<8> opc231, + string OpcodeStr, + SDPatternOperator OpNode> { let ExeDomain = SSEPackedSingle in { - defm VFMADDPSZ : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd", - v16f32_info, X86Fmadd>; - defm VFMSUBPSZ : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub", - v16f32_info, X86Fmsub>; - defm VFMADDSUBPSZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub", - v16f32_info, X86Fmaddsub>; - defm VFMSUBADDPSZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd", - v16f32_info, X86Fmsubadd>; - defm VFNMADDPSZ : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd", - v16f32_info, X86Fnmadd>; - defm VFNMSUBPSZ : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub", - v16f32_info, X86Fnmsub>; -} + defm NAME##PSZ : avx512_fma3p_forms, EVEX_V512; + defm NAME##PSZ256 : avx512_fma3p_forms, EVEX_V256; + defm NAME##PSZ128 : avx512_fma3p_forms, EVEX_V128; + } let ExeDomain = SSEPackedDouble in { - defm VFMADDPDZ : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd", - v8f64_info, X86Fmadd>, VEX_W; - defm VFMSUBPDZ : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub", - v8f64_info, X86Fmsub>, VEX_W; - defm VFMADDSUBPDZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub", - v8f64_info, X86Fmaddsub>, VEX_W; - defm VFMSUBADDPDZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd", - v8f64_info, X86Fmsubadd>, VEX_W; - defm VFNMADDPDZ : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd", - v8f64_info, X86Fnmadd>, VEX_W; - defm VFNMSUBPDZ : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub", - v8f64_info, X86Fnmsub>, VEX_W; + defm NAME##PDZ : avx512_fma3p_forms, EVEX_V512, VEX_W; + defm NAME##PDZ256 : avx512_fma3p_forms, EVEX_V256, VEX_W; + defm NAME##PDZ128 : avx512_fma3p_forms, EVEX_V128, VEX_W; + } } +defm VFMADD : avx512_fma3p<0xA8, 0xB8, "vfmadd", X86Fmadd>; +defm VFMSUB : avx512_fma3p<0xAA, 0xBA, "vfmsub", X86Fmsub>; +defm VFMADDSUB : avx512_fma3p<0xA6, 0xB6, "vfmaddsub", X86Fmaddsub>; +defm VFMSUBADD : avx512_fma3p<0xA7, 0xB7, "vfmsubadd", X86Fmsubadd>; +defm VFNMADD : avx512_fma3p<0xAC, 0xBC, "vfnmadd", X86Fnmadd>; +defm VFNMSUB : avx512_fma3p<0xAE, 0xBE, "vfnmsub", X86Fnmsub>; + let Constraints = "$src1 = $dst" in { multiclass avx512_fma3p_m132 opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { @@ -3551,47 +3548,36 @@ } // Constraints = "$src1 = $dst" +multiclass avx512_fma3p_m132_f opc, + string OpcodeStr, + SDNode OpNode> { + let ExeDomain = SSEPackedSingle in { - defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", X86Fmadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", X86Fmsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", X86Fmaddsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", X86Fmsubadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", X86Fnmadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", X86Fnmsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -} + defm NAME##PSZ : avx512_fma3p_m132, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ256 : avx512_fma3p_m132, EVEX_V256, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ128 : avx512_fma3p_m132, EVEX_V128, EVEX_CD8<32, CD8VF>; + } let ExeDomain = SSEPackedDouble in { - defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", X86Fmadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", X86Fmsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", X86Fmaddsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", X86Fmsubadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", X86Fnmadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", X86Fnmsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm NAME##PDZ : avx512_fma3p_m132, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VF>; + defm NAME##PDZ256 : avx512_fma3p_m132, EVEX_V256, VEX_W, EVEX_CD8<32, CD8VF>; + defm NAME##PDZ128 : avx512_fma3p_m132, EVEX_V128, VEX_W, EVEX_CD8<32, CD8VF>; + } } +defm VFMADD132 : avx512_fma3p_m132_f<0x98, "vfmadd132", X86Fmadd>; +defm VFMSUB132 : avx512_fma3p_m132_f<0x9A, "vfmsub132", X86Fmsub>; +defm VFMADDSUB132 : avx512_fma3p_m132_f<0x96, "vfmaddsub132", X86Fmaddsub>; +defm VFMSUBADD132 : avx512_fma3p_m132_f<0x97, "vfmsubadd132", X86Fmsubadd>; +defm VFNMADD132 : avx512_fma3p_m132_f<0x9C, "vfnmadd132", X86Fnmadd>; +defm VFNMSUB132 : avx512_fma3p_m132_f<0x9E, "vfnmsub132", X86Fnmsub>; + + // Scalar FMA let Constraints = "$src1 = $dst" in { multiclass avx512_fma3s_rm opc, string OpcodeStr, SDNode OpNode, Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -21,7 +21,7 @@ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_SCALAR_MASK_RM, + INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK, INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM }; @@ -380,6 +380,30 @@ X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_128, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_128, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB , 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), Index: test/CodeGen/X86/avx512-fma-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-fma-intrinsics.ll +++ test/CodeGen/X86/avx512-fma-intrinsics.ll @@ -8,6 +8,13 @@ } declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_mask_vfmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfmadd_ps + ; CHECK: vfmadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmadd_pd_z ; CHECK: vfmadd213pd %zmm @@ -32,6 +39,13 @@ } declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_mask_vfmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfmsub_ps + ; CHECK: vfmsub213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmsubpd_z ; CHECK: vfmsub213pd %zmm @@ -40,6 +54,13 @@ } declare <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +define <8 x double> @test_mask_vfmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub_pd + ; CHECK: vfmsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfnmadd_ps_z ; CHECK: vfnmadd213ps %zmm @@ -48,6 +69,13 @@ } declare <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd_ps + ; CHECK: vfnmadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfnmadd_pd_z ; CHECK: vfnmadd213pd %zmm @@ -56,6 +84,13 @@ } declare <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd_pd + ; CHECK: vfnmadd213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfnmsubps_z ; CHECK: vfnmsub213ps %zmm @@ -64,6 +99,13 @@ } declare <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub_ps + ; CHECK: vfnmsub213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfnmsubpd_z ; CHECK: vfnmsub213pd %zmm @@ -72,6 +114,13 @@ } declare <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub_pd + ; CHECK: vfnmsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmaddsubps_z ; CHECK: vfmaddsub213ps %zmm @@ -96,6 +145,13 @@ } declare <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmaddsub_pd + ; CHECK: vfmaddsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmsubaddps_z ; CHECK: vfmsubadd213ps %zmm @@ -104,6 +160,13 @@ } declare <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_mask_vfmsubadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd_ps + ; CHECK: vfmsubadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmsubaddpd_z ; CHECK: vfmsubadd213pd %zmm @@ -111,3 +174,11 @@ ret <8 x double> %res } declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <8 x double> @test_mask_vfmsubadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd_pd + ; CHECK: vfmsubadd213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -611,3 +611,388 @@ } declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone + +declare <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd256_ps + ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps + ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) { +; CHECK-LABEL: test_mask_fmadd256_pd: +; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; CHECK-LABEL: test_mask_fmadd128_pd: +; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) + ret <2 x double> %res +} + +declare <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_vfmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub256_ps + ; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xaa,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_vfmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub128_ps + ; CHECK: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaa,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x double> @test_mask_vfmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub256_pd + ; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xaa,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <2 x double> @test_mask_vfmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub128_pd + ; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaa,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +declare <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd256_ps + ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd128_ps + ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd256_pd + ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd128_pd + ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +declare <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub256_ps + ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub128_ps + ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub256_pd + ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub128_pd + ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +declare <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) { +; CHECK-LABEL: test_mask_fmaddsub256_ps: +; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; CHECK-LABEL: test_mask_fmaddsub128_ps: +; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmaddsub256_pd + ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmaddsub128_pd + ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +declare <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_vfmsubadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd256_ps + ; CHECK: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa7,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_vfmsubadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd128_ps + ; CHECK: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa7,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x double> @test_mask_vfmsubadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd256_pd + ; CHECK: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa7,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} +declare <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <2 x double> @test_mask_vfmsubadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd128_pd + ; CHECK: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_mask_vfmsubadd128rm_pd(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd128rm_pd + ; CHECK: vfmsubadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0x07] + %a2 = load <2 x double>* %ptr_a2 + %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} +declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +define <8 x double> @test_mask_vfmsubaddrm_pd(<8 x double> %a0, <8 x double> %a1, <8 x double>* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubaddrm_pd + ; CHECK: vfmsubadd213pd (%rdi), %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa7,0x07] + %a2 = load <8 x double>* %ptr_a2, align 8 + %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_r + ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rz + ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk + ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] + %a2 = load <4 x float>* %ptr_a2 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka + ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] + %a2 = load <4 x float>* %ptr_a2, align 8 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz + ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07] + %a2 = load <4 x float>* %ptr_a2 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza + ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07] + %a2 = load <4 x float>* %ptr_a2, align 4 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb + ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07] + %q = load float* %ptr_a2 + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 + %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba + ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07] + %q = load float* %ptr_a2, align 4 + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 + %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz + ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] + %q = load float* %ptr_a2 + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 + %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza + ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] + %q = load float* %ptr_a2, align 4 + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 + %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind + ret <4 x float> %res +} + +define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_pd_r + ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { + ; CHECK-LABEL: test_mask_vfmadd128_pd_rz + ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk + ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07] + %a2 = load <2 x double>* %ptr_a2 + %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) { + ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz + ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07] + %a2 = load <2 x double>* %ptr_a2 + %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind + ret <2 x double> %res +} + +define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd256_pd_r + ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { + ; CHECK-LABEL: test_mask_vfmadd256_pd_rz + ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk + ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07] + %a2 = load <4 x double>* %ptr_a2 + %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) { + ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz + ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07] + %a2 = load <4 x double>* %ptr_a2 + %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind + ret <4 x double> %res +}