Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -2073,11 +2073,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmadd_ps_128 : GCCBuiltin<"__builtin_ia32_vfmaddps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmadd_pd_128 : GCCBuiltin<"__builtin_ia32_vfmaddpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_vfmsub_ss : GCCBuiltin<"__builtin_ia32_vfmsubss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -2107,11 +2127,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmsubps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmsub_ps_128 : GCCBuiltin<"__builtin_ia32_vfmsubps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubpd512_mask">, 
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmsubpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmsub_pd_128 : GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_vfnmadd_ss : GCCBuiltin<"__builtin_ia32_vfnmaddss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -2141,11 +2181,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfnmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmaddps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfnmadd_ps_128 : GCCBuiltin<"__builtin_ia32_vfnmaddps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfnmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmaddpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfnmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmaddpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfnmadd_pd_128 : GCCBuiltin<"__builtin_ia32_vfnmaddpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_vfnmsub_ss : GCCBuiltin<"__builtin_ia32_vfnmsubss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -2175,11 +2235,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + 
def int_x86_fma_mask_vfnmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmsubps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfnmsub_ps_128 : GCCBuiltin<"__builtin_ia32_vfnmsubps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfnmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmsubpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfnmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmsubpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfnmsub_pd_128 : GCCBuiltin<"__builtin_ia32_vfnmsubpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -2203,11 +2283,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmaddsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddsubps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmaddsub_ps_128 : GCCBuiltin<"__builtin_ia32_vfmaddsubps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfmaddsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmaddsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, 
llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmaddsub_pd_128 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_vfmsubadd_ps : GCCBuiltin<"__builtin_ia32_vfmsubaddps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], @@ -2231,11 +2331,31 @@ [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmsubadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmsubaddps256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmsubadd_ps_128 : GCCBuiltin<"__builtin_ia32_vfmsubaddps128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], + [IntrNoMem]>; def int_x86_fma_mask_vfmsubadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_fma_mask_vfmsubadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd256_mask">, + Intrinsic<[llvm_v4f64_ty], + [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], + [IntrNoMem]>; + def int_x86_fma_mask_vfmsubadd_pd_128 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd128_mask">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], + [IntrNoMem]>; } //===----------------------------------------------------------------------===// Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -16982,6 +16982,16 @@ return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, PassThru); } + case FMA_OP_MASK: + { + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + 
Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)), + Op.getOperand(4), Op.getOperand(1), + Subtarget, DAG); + } default: break; } Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -67,9 +67,9 @@ PatFrag MemOpFrag = !if (!eq (NumElts#EltTypeName, "1f32"), !cast("memopfsf32"), !if (!eq (NumElts#EltTypeName, "1f64"), !cast("memopfsf64"), - !if (!eq (TypeVariantName, "f"), !cast("memop" # VTName), - !if (!eq (EltTypeName, "i64"), !cast("memop" # VTName), - !if (!eq (VTName, "v16i32"), !cast("memop" # VTName), ?))))); + !if (!eq (TypeVariantName, "f"), !cast("memopu" # VTName), + !if (!eq (EltTypeName, "i64"), !cast("memopu" # VTName), + !if (!eq (VTName, "v16i32"), !cast("memopu" # VTName), ?))))); // The corresponding float type, e.g. v16f32 for v16i32 // Note: For EltSize < 32, FloatVT is illegal and TableGen @@ -954,15 +954,15 @@ EVEX_4V; } -defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv16i32, i512mem, +defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopuv16i32, i512mem, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, +defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopuv8i64, i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; let ExeDomain = SSEPackedSingle in -defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv16f32, f512mem, +defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopuv16f32, f512mem, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, +defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopuv8f64, f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; // -- VPERM2I - 3 source operands form -- @@ -1037,16 +1037,16 @@ EVEX_4V, EVEX_KZ; } } -defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, +defm 
VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopuv16i32, i512mem, X86VPermiv3, v16i32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, +defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopuv8i64, i512mem, X86VPermiv3, v8i64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, +defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopuv16f32, i512mem, X86VPermiv3, v16f32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, +defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopuv8f64, i512mem, X86VPermiv3, v8f64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; @@ -1066,16 +1066,16 @@ (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>; } -defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, memopv16i32, i512mem, +defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, memopuv16i32, i512mem, X86VPermv3, v16i32, VK16WM, v16i1, GR16>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, memopv8i64, i512mem, +defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, memopuv8i64, i512mem, X86VPermv3, v8i64, VK8WM, v8i1, GR8>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, memopv16f32, i512mem, +defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, memopuv16f32, i512mem, X86VPermv3, v16f32, VK16WM, v16i1, GR16>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopv8f64, i512mem, +defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopuv8f64, i512mem, X86VPermv3, v8f64, VK8WM, v8i1, GR8>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; @@ -1103,12 +1103,12 @@ let ExeDomain = SSEPackedSingle in defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", VK16WM, VR512, f512mem, - memopv16f32, 
vselect, v16f32>, + memopuv16f32, vselect, v16f32>, EVEX_CD8<32, CD8VF>, EVEX_V512; let ExeDomain = SSEPackedDouble in defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", VK8WM, VR512, f512mem, - memopv8f64, vselect, v8f64>, + memopuv8f64, vselect, v8f64>, VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; def : Pat<(v16f32 (int_x86_avx512_mask_blend_ps_512 (v16f32 VR512:$src1), @@ -1123,12 +1123,12 @@ defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", VK16WM, VR512, f512mem, - memopv16i32, vselect, v16i32>, + memopuv16i32, vselect, v16i32>, EVEX_CD8<32, CD8VF>, EVEX_V512; defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", VK8WM, VR512, f512mem, - memopv8i64, vselect, v8i64>, + memopuv8i64, vselect, v8i64>, VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; def : Pat<(v16i32 (int_x86_avx512_mask_blend_d_512 (v16i32 VR512:$src1), @@ -2962,12 +2962,12 @@ SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", + memopuv8i64, i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", + memopuv8i64, i512mem, loadi64, i64mem, "{1to8}", SSE_INTMUL_ITINS_P, 1>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))), @@ -3053,16 +3053,16 @@ d>, EVEX_4V; } -defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopv8f64, +defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopuv8f64, VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopv8f64, +defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopuv8f64, VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, PD, 
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopv8f64, +defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopuv8f64, VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopv8f64, +defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopuv8f64, VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; @@ -3082,16 +3082,16 @@ IIC_SSE_UNPCK>, EVEX_4V; } defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32, - VR512, memopv16i32, i512mem>, EVEX_V512, + VR512, memopuv16i32, i512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64, - VR512, memopv8i64, i512mem>, EVEX_V512, + VR512, memopuv8i64, i512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32, - VR512, memopv16i32, i512mem>, EVEX_V512, + VR512, memopuv16i32, i512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, - VR512, memopv8i64, i512mem>, EVEX_V512, + VR512, memopuv8i64, i512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 - PSHUFD @@ -3116,7 +3116,7 @@ (i8 imm:$src2))))]>, EVEX; } -defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32, +defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopuv16i32, i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; //===----------------------------------------------------------------------===// @@ -3250,18 +3250,18 @@ } defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem, - memopv16i32, X86testm, v16i32>, T8PD, EVEX_V512, + memopuv16i32, 
X86testm, v16i32>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem, - memopv8i64, X86testm, v8i64>, T8PD, EVEX_V512, VEX_W, + memopuv8i64, X86testm, v8i64>, T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; let Predicates = [HasCDI] in { defm VPTESTNMDZ : avx512_vptest<0x27, "vptestnmd", VK16, VR512, f512mem, - memopv16i32, X86testnm, v16i32>, T8XS, EVEX_V512, + memopuv16i32, X86testnm, v16i32>, T8XS, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPTESTNMQZ : avx512_vptest<0x27, "vptestnmq", VK8, VR512, f512mem, - memopv8i64, X86testnm, v8i64>, T8XS, EVEX_V512, VEX_W, + memopuv8i64, X86testnm, v8i64>, T8XS, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; } @@ -3301,7 +3301,7 @@ defm rm : AVX512_maskable, AVX512BIBase, EVEX_4V; } @@ -3392,7 +3392,7 @@ (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX; } -defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>, +defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopuv8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))), (VMOVDDUPZrm addr:$src)>; @@ -3413,17 +3413,17 @@ } defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - v16f32, VR512, memopv16f32, f512mem>, EVEX_V512, + v16f32, VR512, memopuv16f32, f512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v16f32, VR512, memopv16f32, f512mem>, EVEX_V512, + v16f32, VR512, memopuv16f32, f512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movshdup (memopv16i32 addr:$src))), +def : Pat<(v16i32 (X86Movshdup (memopuv16i32 addr:$src))), (VMOVSHDUPZrm addr:$src)>; def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movsldup (memopv16i32 addr:$src))), +def : Pat<(v16i32 (X86Movsldup (memopuv16i32 addr:$src))), (VMOVSLDUPZrm 
addr:$src)>; //===----------------------------------------------------------------------===// @@ -3467,61 +3467,58 @@ AVX512FMA3Base; let mayLoad = 1 in - def m: AVX512FMA3; - def mb: AVX512FMA3, EVEX_B; -} + defm m: AVX512_maskable_3src, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src, + AVX512FMA3Base, EVEX_B; + } } // Constraints = "$src1 = $dst" multiclass avx512_fma3p_forms opc213, bits<8> opc231, string OpcodeStr, X86VectorVTInfo VTI, SDPatternOperator OpNode> { - defm v213 : avx512_fma3p_rm, - EVEX_V512, EVEX_CD8; - - defm v231 : avx512_fma3p_rm, - EVEX_V512, EVEX_CD8; + defm v213r : avx512_fma3p_rm, EVEX_CD8; + + defm v231r : avx512_fma3p_rm, EVEX_CD8; } +multiclass avx512_fma3p opc213, bits<8> opc231, + string OpcodeStr, + SDPatternOperator OpNode> { let ExeDomain = SSEPackedSingle in { - defm VFMADDPSZ : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd", - v16f32_info, X86Fmadd>; - defm VFMSUBPSZ : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub", - v16f32_info, X86Fmsub>; - defm VFMADDSUBPSZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub", - v16f32_info, X86Fmaddsub>; - defm VFMSUBADDPSZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd", - v16f32_info, X86Fmsubadd>; - defm VFNMADDPSZ : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd", - v16f32_info, X86Fnmadd>; - defm VFNMSUBPSZ : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub", - v16f32_info, X86Fnmsub>; -} + defm NAME##PSZ : avx512_fma3p_forms, EVEX_V512; + defm NAME##PSZ256 : avx512_fma3p_forms, EVEX_V256; + defm NAME##PSZ128 : avx512_fma3p_forms, EVEX_V128; + } let ExeDomain = SSEPackedDouble in { - defm VFMADDPDZ : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd", - v8f64_info, X86Fmadd>, VEX_W; - defm VFMSUBPDZ : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub", - v8f64_info, X86Fmsub>, VEX_W; - defm VFMADDSUBPDZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub", - v8f64_info, X86Fmaddsub>, VEX_W; - defm VFMSUBADDPDZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd", - v8f64_info, X86Fmsubadd>, VEX_W; - defm VFNMADDPDZ : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd", 
- v8f64_info, X86Fnmadd>, VEX_W; - defm VFNMSUBPDZ : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub", - v8f64_info, X86Fnmsub>, VEX_W; + defm NAME##PDZ : avx512_fma3p_forms, EVEX_V512, VEX_W; + defm NAME##PDZ256 : avx512_fma3p_forms, EVEX_V256, VEX_W; + defm NAME##PDZ128 : avx512_fma3p_forms, EVEX_V128, VEX_W; + } } +defm VFMADD : avx512_fma3p<0xA8, 0xB8, "vfmadd", X86Fmadd>; +defm VFMSUB : avx512_fma3p<0xAA, 0xBA, "vfmsub", X86Fmsub>; +defm VFMADDSUB : avx512_fma3p<0xA6, 0xB6, "vfmaddsub", X86Fmaddsub>; +defm VFMSUBADD : avx512_fma3p<0xA7, 0xB7, "vfmsubadd", X86Fmsubadd>; +defm VFNMADD : avx512_fma3p<0xAC, 0xBC, "vfnmadd", X86Fnmadd>; +defm VFNMSUB : avx512_fma3p<0xAE, 0xBE, "vfnmsub", X86Fnmsub>; + let Constraints = "$src1 = $dst" in { multiclass avx512_fma3p_m132 opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { @@ -3543,47 +3540,36 @@ } // Constraints = "$src1 = $dst" +multiclass avx512_fma3p_m132_f opc, + string OpcodeStr, + SDNode OpNode> { + let ExeDomain = SSEPackedSingle in { - defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", X86Fmadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", X86Fmsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", X86Fmaddsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", X86Fmsubadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", X86Fnmadd, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", X86Fnmsub, - v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -} + defm NAME##PSZ : avx512_fma3p_m132, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ256 : avx512_fma3p_m132, EVEX_V256, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ128 : avx512_fma3p_m132, EVEX_V128, EVEX_CD8<32, CD8VF>; + } let ExeDomain = 
SSEPackedDouble in { - defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", X86Fmadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", X86Fmsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", X86Fmaddsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", X86Fmsubadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", X86Fnmadd, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", X86Fnmsub, - v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm NAME##PDZ : avx512_fma3p_m132, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; // PD forms use 64-bit elements, as in the removed VF*132PDZ defs + defm NAME##PDZ256 : avx512_fma3p_m132, EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + defm NAME##PDZ128 : avx512_fma3p_m132, EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + } } +defm VFMADD132 : avx512_fma3p_m132_f<0x98, "vfmadd132", X86Fmadd>; +defm VFMSUB132 : avx512_fma3p_m132_f<0x9A, "vfmsub132", X86Fmsub>; +defm VFMADDSUB132 : avx512_fma3p_m132_f<0x96, "vfmaddsub132", X86Fmaddsub>; +defm VFMSUBADD132 : avx512_fma3p_m132_f<0x97, "vfmsubadd132", X86Fmsubadd>; +defm VFNMADD132 : avx512_fma3p_m132_f<0x9C, "vfnmadd132", X86Fnmadd>; +defm VFNMSUB132 : avx512_fma3p_m132_f<0x9E, "vfnmsub132", X86Fnmsub>; + + // Scalar FMA let Constraints = "$src1 = $dst" in { multiclass avx512_fma3s_rm opc, string OpcodeStr, SDNode OpNode, @@ -3924,12 +3910,12 @@ } defm VCVTPD2PSZ : avx512_vcvt_fp_with_rc<0x5A, "vcvtpd2ps", VR512, VR256X, fround, - memopv8f64, f512mem, v8f32, v8f64, + memopuv8f64, f512mem, v8f32, v8f64, SSEPackedSingle>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend, - memopv4f64, f256mem, v8f64, v8f32, + memopuv4f64, f256mem, v8f64, v8f32, 
SSEPackedDouble>, EVEX_V512, PS, EVEX_CD8<32, CD8VH>; def : Pat<(v8f64 (extloadv8f32 addr:$src)), @@ -3948,27 +3934,27 @@ //===----------------------------------------------------------------------===// defm VCVTDQ2PSZ : avx512_vcvt_fp_with_rc<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp, - memopv8i64, i512mem, v16f32, v16i32, + memopuv8i64, i512mem, v16f32, v16i32, SSEPackedSingle>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp, - memopv4i64, i256mem, v8f64, v8i32, + memopuv4i64, i256mem, v8f64, v8i32, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, - memopv16f32, f512mem, v16i32, v16f32, + memopuv16f32, f512mem, v16i32, v16f32, SSEPackedSingle>, EVEX_V512, XS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, - memopv8f64, f512mem, v8i32, v8f64, + memopuv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint, - memopv16f32, f512mem, v16i32, v16f32, + memopuv16f32, f512mem, v16i32, v16f32, SSEPackedSingle>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; @@ -3978,7 +3964,7 @@ (VCVTTPS2UDQZrr VR512:$src)>; defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint, - memopv8f64, f512mem, v8i32, v8f64, + memopuv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PS, VEX_W, EVEX_CD8<64, CD8VF>; @@ -3988,12 +3974,12 @@ (VCVTTPD2UDQZrr VR512:$src)>; defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp, - memopv4i64, f256mem, v8f64, v8i32, + memopuv4i64, f256mem, v8f64, v8i32, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, - memopv16i32, f512mem, v16f32, v16i32, + memopuv16i32, f512mem, v16f32, v16i32, 
SSEPackedSingle>, EVEX_V512, XD, EVEX_CD8<32, CD8VF>; @@ -4048,10 +4034,10 @@ } defm VCVTPS2DQZ : avx512_vcvt_fp2int<0x5B, "vcvtps2dq", VR512, VR512, - memopv16f32, f512mem, SSEPackedSingle>, PD, + memopuv16f32, f512mem, SSEPackedSingle>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VCVTPD2DQZ : avx512_vcvt_fp2int<0xE6, "vcvtpd2dq", VR512, VR256X, - memopv8f64, f512mem, SSEPackedDouble>, XD, VEX_W, + memopuv8f64, f512mem, SSEPackedDouble>, XD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2dq_512 (v16f32 VR512:$src), @@ -4063,10 +4049,10 @@ (VCVTPD2DQZrrb VR512:$src, imm:$rc)>; defm VCVTPS2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtps2udq", VR512, VR512, - memopv16f32, f512mem, SSEPackedSingle>, + memopuv16f32, f512mem, SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VCVTPD2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtpd2udq", VR512, VR256X, - memopv8f64, f512mem, SSEPackedDouble>, VEX_W, + memopuv8f64, f512mem, SSEPackedDouble>, VEX_W, PS, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2udq_512 (v16f32 VR512:$src), @@ -4620,7 +4606,7 @@ defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512, - memopv16f32, SSEPackedSingle>, EVEX_V512, + memopuv16f32, SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>; def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), @@ -4630,7 +4616,7 @@ defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512, - memopv8f64, SSEPackedDouble>, EVEX_V512, + memopuv8f64, SSEPackedDouble>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), @@ -4828,35 +4814,35 @@ } defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VK16WM, VR512, VR128X, X86vzext, - memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, + memopuv2i64, i128mem, v16i32, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VQ>; defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VK8WM, VR512, VR128X, X86vzext, - memopv2i64, i128mem, 
v8i64, v16i8>, EVEX_V512, + memopuv2i64, i128mem, v8i64, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VO>; defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VK16WM, VR512, VR256X, X86vzext, - memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, + memopuv4i64, i256mem, v16i32, v16i16>, EVEX_V512, EVEX_CD8<16, CD8VH>; defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VK8WM, VR512, VR128X, X86vzext, - memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, + memopuv2i64, i128mem, v8i64, v8i16>, EVEX_V512, EVEX_CD8<16, CD8VQ>; defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VK8WM, VR512, VR256X, X86vzext, - memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, + memopuv4i64, i256mem, v8i64, v8i32>, EVEX_V512, EVEX_CD8<32, CD8VH>; defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VK16WM, VR512, VR128X, X86vsext, - memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, + memopuv2i64, i128mem, v16i32, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VQ>; defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VK8WM, VR512, VR128X, X86vsext, - memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, + memopuv2i64, i128mem, v8i64, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VO>; defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VK16WM, VR512, VR256X, X86vsext, - memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, + memopuv4i64, i256mem, v16i32, v16i16>, EVEX_V512, EVEX_CD8<16, CD8VH>; defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VK8WM, VR512, VR128X, X86vsext, - memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, + memopuv2i64, i128mem, v8i64, v8i16>, EVEX_V512, EVEX_CD8<16, CD8VQ>; defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext, - memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, + memopuv4i64, i256mem, v8i64, v8i32>, EVEX_V512, EVEX_CD8<32, CD8VH>; //===----------------------------------------------------------------------===// @@ -5009,21 +4995,21 @@ EVEX_4V, Sched<[WriteShuffle]>; } -defm VSHUFPSZ : avx512_shufp, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VSHUFPDZ : avx512_shufp, PD, VEX_W, EVEX_V512, EVEX_CD8<64, 
CD8VF>; def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>; def : Pat<(v16i32 (X86Shufp VR512:$src1, - (memopv16i32 addr:$src2), (i8 imm:$imm))), + (memopuv16i32 addr:$src2), (i8 imm:$imm))), (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>; def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>; def : Pat<(v8i64 (X86Shufp VR512:$src1, - (memopv8i64 addr:$src2), (i8 imm:$imm))), + (memopuv8i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; multiclass avx512_valign { @@ -5222,11 +5208,11 @@ (VPLZCNTQrrk VR512:$src1, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; -def : Pat<(v16i32 (ctlz (memopv16i32 addr:$src))), +def : Pat<(v16i32 (ctlz (memopuv16i32 addr:$src))), (VPLZCNTDrm addr:$src)>; def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))), (VPLZCNTDrr VR512:$src)>; -def : Pat<(v8i64 (ctlz (memopv8i64 addr:$src))), +def : Pat<(v8i64 (ctlz (memopuv8i64 addr:$src))), (VPLZCNTQrm addr:$src)>; def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))), (VPLZCNTQrr VR512:$src)>; Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -451,10 +451,16 @@ def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; // 512-bit memop pattern fragments -def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>; -def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop8 node:$ptr))>; -def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>; -def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop8 node:$ptr))>; +def memopuv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>; +def memopuv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop8 node:$ptr))>; +def memopuv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>; +def 
memopuv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop8 node:$ptr))>; +def memopuv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop8 node:$ptr))>; +def memopuv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop8 node:$ptr))>; +def memopuv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop4 node:$ptr))>; +def memopuv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop8 node:$ptr))>; +def memopuv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop4 node:$ptr))>; +def memopuv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop8 node:$ptr))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -21,7 +21,7 @@ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_SCALAR_MASK_RM, + INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK, INTR_TYPE_SCALAR_MASK_RM, COMPRESS_TO_REG, COMPRESS_TO_MEM }; @@ -332,6 +332,30 @@ X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + 
X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_128, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_128, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB , 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), Index: test/CodeGen/X86/avx512-fma-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-fma-intrinsics.ll +++ test/CodeGen/X86/avx512-fma-intrinsics.ll @@ -8,6 +8,13 @@ } declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_mask_vfmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x 
float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfmadd_ps + ; CHECK: vfmadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmadd_pd_z ; CHECK: vfmadd213pd %zmm @@ -32,6 +39,13 @@ } declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_mask_vfmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfmsub_ps + ; CHECK: vfmsub213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmsubpd_z ; CHECK: vfmsub213pd %zmm @@ -40,6 +54,13 @@ } declare <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +define <8 x double> @test_mask_vfmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub_pd + ; CHECK: vfmsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfnmadd_ps_z ; CHECK: vfnmadd213ps %zmm @@ -48,6 +69,13 @@ } declare <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: 
test_mask_vfnmadd_ps + ; CHECK: vfnmadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfnmadd_pd_z ; CHECK: vfnmadd213pd %zmm @@ -56,6 +84,13 @@ } declare <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd_pd + ; CHECK: vfnmadd213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfnmsubps_z ; CHECK: vfnmsub213ps %zmm @@ -64,6 +99,13 @@ } declare <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub_ps + ; CHECK: vfnmsub213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfnmsubpd_z ; CHECK: vfnmsub213pd %zmm @@ -72,6 +114,13 @@ } declare <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub_pd + ; CHECK: 
vfnmsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmaddsubps_z ; CHECK: vfmaddsub213ps %zmm @@ -96,6 +145,13 @@ } declare <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmaddsub_pd + ; CHECK: vfmaddsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmsubaddps_z ; CHECK: vfmsubadd213ps %zmm @@ -104,6 +160,13 @@ } declare <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_mask_vfmsubadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd_ps + ; CHECK: vfmsubadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind + ret <16 x float> %res +} + define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmsubaddpd_z ; CHECK: vfmsubadd213pd %zmm @@ -111,3 +174,11 @@ ret <8 x double> %res } declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <8 x double> @test_mask_vfmsubadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: 
test_mask_vfmsubadd_pd + ; CHECK: vfmsubadd213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -611,3 +611,328 @@ } declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone + +declare <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd256_ps + ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps + ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) + +define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) { +; CHECK-LABEL: test_mask_fmadd256_pd: +; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> 
%a, <4 x double> %b, <4 x double> %c, i8 %mask) + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) + +define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; CHECK-LABEL: test_mask_fmadd128_pd: +; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) + ret <2 x double> %res +} + +declare <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_vfmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub256_ps + ; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xaa,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_vfmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub128_ps + ; CHECK: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaa,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x double> @test_mask_vfmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub256_pd + ; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xaa,0xc2] + %res = call <4 x double> 
@llvm.x86.fma.mask.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <2 x double> @test_mask_vfmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub128_pd + ; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaa,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +declare <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd256_ps + ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd128_ps + ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd256_pd + ; CHECK: 
vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd128_pd + ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +declare <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub256_ps + ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub128_ps + ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x double> @test_mask_vfnmsub256_pd(<4 x 
double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub256_pd + ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub128_pd + ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +declare <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) { +; CHECK-LABEL: test_mask_fmaddsub256_ps: +; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; CHECK-LABEL: test_mask_fmaddsub128_ps: +; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double>, <4 x 
double>, <4 x double>, i8) nounwind readnone + +define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmaddsub256_pd + ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} + +declare <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmaddsub128_pd + ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +declare <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + +define <8 x float> @test_mask_vfmsubadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd256_ps + ; CHECK: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa7,0xc2] + %res = call <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mask_vfmsubadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd128_ps + ; CHECK: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa7,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float> %a0, <4 x float> 
%a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +declare <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x double> @test_mask_vfmsubadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd256_pd + ; CHECK: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa7,0xc2] + %res = call <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind + ret <4 x double> %res +} +declare <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <2 x double> @test_mask_vfmsubadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd128_pd + ; CHECK: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0xc2] + %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_mask_vfmsubadd128rm_pd(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd128rm_pd + ; CHECK: vfmsubadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0x07] + %a2 = load <2 x double>* %ptr_a2 + %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind + ret <2 x double> %res +} +declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone +define <8 x double> @test_mask_vfmsubaddrm_pd(<8 x double> %a0, <8 x double> %a1, <8 x double>* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubaddrm_pd + ; CHECK: vfmsubadd213pd (%rdi), %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa7,0x07] 
+ %a2 = load <8 x double>* %ptr_a2, align 8 + %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind + ret <8 x double> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_r + ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rz + ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2] + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk + ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] + %a2 = load <4 x float>* %ptr_a2 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka + ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] + %a2 = load <4 x float>* %ptr_a2, align 8 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x 
float>* %ptr_a2) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz + ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07] + %a2 = load <4 x float>* %ptr_a2 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza + ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07] + %a2 = load <4 x float>* %ptr_a2, align 4 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb + ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07] + %q = load float* %ptr_a2 + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 + %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba + ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07] + %q = load float* %ptr_a2, align 4 + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 + 
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz + ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] + %q = load float* %ptr_a2 + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 + %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) { + ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza + ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] + %q = load float* %ptr_a2, align 4 + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 + %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 + %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind + ret <4 x float> %res +} \ No newline at end of file