Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -4511,6 +4511,36 @@ def int_x86_avx512_mask_pavg_w_256 : GCCBuiltin<"__builtin_ia32_pavgw256_mask">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_mask_pmaddw_d_128 : + GCCBuiltin<"__builtin_ia32_pmaddwd128_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmaddw_d_256 : + GCCBuiltin<"__builtin_ia32_pmaddwd256_mask">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmaddw_d_512 : + GCCBuiltin<"__builtin_ia32_pmaddwd512_mask">, + Intrinsic<[llvm_v16i32_ty], + [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmaddubs_w_128 : + GCCBuiltin<"__builtin_ia32_pmaddubsw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmaddubs_w_256 : + GCCBuiltin<"__builtin_ia32_pmaddubsw256_mask">, + Intrinsic<[llvm_v16i16_ty], + [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmaddubs_w_512 : + GCCBuiltin<"__builtin_ia32_pmaddubsw512_mask">, + Intrinsic<[llvm_v32i16_ty], + [llvm_v64i8_ty, llvm_v64i8_ty, llvm_v32i16_ty, llvm_i32_ty], + [IntrNoMem]>; } // Gather and Scatter ops Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -403,7 +403,8 @@ PMULDQ, // Vector Multiply Packed UnsignedIntegers with Round and Scale MULHRS, - + // Multiply and Add Packed Integers + VPMADDUBSW, VPMADDWD, // FMA nodes FMADD, FNMADD, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -19008,6 +19008,8 @@ case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; + case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -3104,7 +3104,14 @@ (bitconvert (_Src.LdFrag addr:$src2)))), itins.rm>, AVX512BIBase, EVEX_4V; + } +} +multiclass avx512_binop_rmb2 opc, string OpcodeStr, OpndItins itins, + SDNode OpNode,X86VectorVTInfo _Src, + X86VectorVTInfo _Dst, bit IsCommutable = 0> : + avx512_binop_rm2 { + let mayLoad = 1 in defm rmb : AVX512_maskable, AVX512BIBase, EVEX_4V, EVEX_B; - } } defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, @@ -3145,27 +3151,50 @@ defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, SSE_INTALU_ITINS_P, HasBWI, 1>; -multiclass avx512_binop_all opc, string OpcodeStr, OpndItins itins, +multiclass avx512_binop_rmb_all opc, string OpcodeStr, OpndItins itins, SDNode OpNode, bit IsCommutable = 0> { - defm NAME#Z : avx512_binop_rm2, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; let Predicates = [HasVLX] in { - defm NAME#Z256 : avx512_binop_rm2, EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W; - defm NAME#Z128 : avx512_binop_rm2, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W; } } -defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P, +multiclass avx512_binop_rm_all opc, string OpcodeStr,SDNode OpNode, + AVX512VLVectorVTInfo _Src, AVX512VLVectorVTInfo _Dst, + Predicate prd, bit IsCommutable = 0, + OpndItins itins = SSE_INTALU_ITINS_P> { + let Predicates = [prd] in + defm NAME#Z : avx512_binop_rm2, + EVEX_V512, EVEX_CD8<_Src.info512.EltSize, CD8VF>; + let Predicates = [HasVLX, prd] in { + defm NAME#Z256 : avx512_binop_rm2, + EVEX_V256, EVEX_CD8<_Src.info256.EltSize, CD8VF>; + defm NAME#Z128 : avx512_binop_rm2, + EVEX_V128, EVEX_CD8<_Src.info128.EltSize, CD8VF>; + } +} + +defm VPMULDQ : avx512_binop_rmb_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P, X86pmuldq, 1>,T8PD; -defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P, +defm VPMULUDQ : avx512_binop_rmb_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P, X86pmuludq, 1>; +defm VPMADDUBSW : avx512_binop_rm_all<0x04, "vpmaddubsw", X86vpmaddubsw, + avx512vl_i8_info, avx512vl_i16_info, HasBWI, 1>, T8PD; +defm VPMADDWD : avx512_binop_rm_all<0xF5, "vpmaddwd", X86vpmaddwd, + avx512vl_i16_info, avx512vl_i32_info, HasBWI, 1>; + multiclass avx512_packs_rmb opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { let mayLoad = 1 in { @@ -6598,3 +6627,5 @@ (bc_v8i64 (v8i1sextv8i64)), (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), (VPABSQZrr VR512:$src)>; + + Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -288,6 +288,9 @@ def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; +def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>; +def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>; + def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -596,6 +596,18 @@ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1038,3 +1038,36 @@ %res2 = add <32 x i16> %res, %res1 ret <32 x i16> %res2 } + +declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x04,0xd1] +; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x04,0xc1] +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf5,0xd1] +; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xc1] +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -3843,3 +3843,71 @@ %res2 = add <16 x i16> %res, %res1 ret <16 x i16> %res2 } + +declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax ## encoding: [0x40,0x0f,0xb6,0xc7] +; CHECK-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1] +; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf5,0xc1] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax ## encoding: [0x40,0x0f,0xb6,0xc7] +; CHECK-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1] +; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf5,0xc1] +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax ## encoding: [0x40,0x0f,0xb6,0xc7] +; CHECK-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1] +; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x04,0xc1] +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1] +; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x04,0xc1] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} + Index: test/MC/X86/x86-64-avx512bw.s =================================================================== --- test/MC/X86/x86-64-avx512bw.s +++ test/MC/X86/x86-64-avx512bw.s @@ -3776,3 +3776,75 @@ // CHECK: encoding: [0x62,0xe2,0x25,0x40,0x0b,0xaa,0xc0,0xdf,0xff,0xff] vpmulhrsw -8256(%rdx), %zmm27, %zmm21 +// CHECK: vpmaddubsw %zmm25, %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x02,0x5d,0x40,0x04,0xd9] + vpmaddubsw %zmm25, %zmm20, %zmm27 + +// CHECK: vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3} +// CHECK: encoding: [0x62,0x02,0x5d,0x43,0x04,0xd9] + vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3} + +// CHECK: vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3} {z} +// CHECK: encoding: [0x62,0x02,0x5d,0xc3,0x04,0xd9] + vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3} {z} + +// CHECK: vpmaddubsw (%rcx), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x19] + vpmaddubsw (%rcx), %zmm20, %zmm27 + +// CHECK: vpmaddubsw 291(%rax,%r14,8), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x22,0x5d,0x40,0x04,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpmaddubsw 291(%rax,%r14,8), %zmm20, %zmm27 + +// CHECK: vpmaddubsw 8128(%rdx), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x5a,0x7f] + vpmaddubsw 8128(%rdx), %zmm20, %zmm27 + +// CHECK: vpmaddubsw 8192(%rdx), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x9a,0x00,0x20,0x00,0x00] + vpmaddubsw 8192(%rdx), %zmm20, %zmm27 + +// CHECK: vpmaddubsw -8192(%rdx), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x5a,0x80] + vpmaddubsw -8192(%rdx), %zmm20, %zmm27 + +// CHECK: vpmaddubsw -8256(%rdx), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x9a,0xc0,0xdf,0xff,0xff] + vpmaddubsw -8256(%rdx), %zmm20, %zmm27 + +// CHECK: vpmaddwd %zmm25, %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x01,0x4d,0x40,0xf5,0xd1] + vpmaddwd %zmm25, %zmm22, %zmm26 + +// CHECK: vpmaddwd %zmm25, %zmm22, %zmm26 {%k2} +// CHECK: encoding: [0x62,0x01,0x4d,0x42,0xf5,0xd1] + vpmaddwd %zmm25, %zmm22, %zmm26 {%k2} + +// CHECK: vpmaddwd %zmm25, %zmm22, %zmm26 {%k2} {z} +// CHECK: encoding: [0x62,0x01,0x4d,0xc2,0xf5,0xd1] + vpmaddwd %zmm25, %zmm22, %zmm26 {%k2} {z} + +// CHECK: vpmaddwd (%rcx), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x11] + vpmaddwd (%rcx), %zmm22, %zmm26 + +// CHECK: vpmaddwd 291(%rax,%r14,8), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x21,0x4d,0x40,0xf5,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmaddwd 291(%rax,%r14,8), %zmm22, %zmm26 + +// CHECK: vpmaddwd 8128(%rdx), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x52,0x7f] + vpmaddwd 8128(%rdx), %zmm22, %zmm26 + +// CHECK: vpmaddwd 8192(%rdx), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x92,0x00,0x20,0x00,0x00] + vpmaddwd 8192(%rdx), %zmm22, %zmm26 + +// CHECK: vpmaddwd -8192(%rdx), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x52,0x80] + vpmaddwd -8192(%rdx), %zmm22, %zmm26 + +// CHECK: vpmaddwd -8256(%rdx), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x92,0xc0,0xdf,0xff,0xff] + vpmaddwd -8256(%rdx), %zmm22, %zmm26 + Index: test/MC/X86/x86-64-avx512bw_vl.s =================================================================== --- test/MC/X86/x86-64-avx512bw_vl.s +++ test/MC/X86/x86-64-avx512bw_vl.s @@ -6799,3 +6799,291 @@ // CHECK: encoding: [0x62,0x62,0x5d,0x20,0x0b,0xa2,0xe0,0xef,0xff,0xff] vpmulhrsw -4128(%rdx), %ymm20, %ymm28 +// CHECK: vpmaddubsw %xmm20, %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x22,0x55,0x00,0x04,0xe4] + vpmaddubsw %xmm20, %xmm21, %xmm28 + +// CHECK: vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6} +// CHECK: encoding: [0x62,0x22,0x55,0x06,0x04,0xe4] + vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6} + +// CHECK: vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6} {z} +// CHECK: encoding: [0x62,0x22,0x55,0x86,0x04,0xe4] + vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6} {z} + +// CHECK: vpmaddubsw (%rcx), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0x21] + vpmaddubsw (%rcx), %xmm21, %xmm28 + +// CHECK: vpmaddubsw 291(%rax,%r14,8), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x22,0x55,0x00,0x04,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmaddubsw 291(%rax,%r14,8), %xmm21, %xmm28 + +// CHECK: vpmaddubsw 2032(%rdx), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0x62,0x7f] + vpmaddubsw 2032(%rdx), %xmm21, %xmm28 + +// CHECK: vpmaddubsw 2048(%rdx), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0xa2,0x00,0x08,0x00,0x00] + vpmaddubsw 2048(%rdx), %xmm21, %xmm28 + +// CHECK: vpmaddubsw -2048(%rdx), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0x62,0x80] + vpmaddubsw -2048(%rdx), %xmm21, %xmm28 + +// CHECK: vpmaddubsw -2064(%rdx), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0xa2,0xf0,0xf7,0xff,0xff] + vpmaddubsw -2064(%rdx), %xmm21, %xmm28 + +// CHECK: vpmaddubsw %ymm26, %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x02,0x2d,0x20,0x04,0xf2] + vpmaddubsw %ymm26, %ymm26, %ymm30 + +// CHECK: vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5} +// CHECK: encoding: [0x62,0x02,0x2d,0x25,0x04,0xf2] + vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5} + +// CHECK: vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5} {z} +// CHECK: encoding: [0x62,0x02,0x2d,0xa5,0x04,0xf2] + vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5} {z} + +// CHECK: vpmaddubsw (%rcx), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0x31] + vpmaddubsw (%rcx), %ymm26, %ymm30 + +// CHECK: vpmaddubsw 291(%rax,%r14,8), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x22,0x2d,0x20,0x04,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmaddubsw 291(%rax,%r14,8), %ymm26, %ymm30 + +// CHECK: vpmaddubsw 4064(%rdx), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0x72,0x7f] + vpmaddubsw 4064(%rdx), %ymm26, %ymm30 + +// CHECK: vpmaddubsw 4096(%rdx), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0xb2,0x00,0x10,0x00,0x00] + vpmaddubsw 4096(%rdx), %ymm26, %ymm30 + +// CHECK: vpmaddubsw -4096(%rdx), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0x72,0x80] + vpmaddubsw -4096(%rdx), %ymm26, %ymm30 + +// CHECK: vpmaddubsw -4128(%rdx), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0xb2,0xe0,0xef,0xff,0xff] + vpmaddubsw -4128(%rdx), %ymm26, %ymm30 + +// CHECK: vpmaddwd %xmm28, %xmm24, %xmm17 +// CHECK: encoding: [0x62,0x81,0x3d,0x00,0xf5,0xcc] + vpmaddwd %xmm28, %xmm24, %xmm17 + +// CHECK: vpmaddwd %xmm28, %xmm24, %xmm17 {%k1} +// CHECK: encoding: [0x62,0x81,0x3d,0x01,0xf5,0xcc] + vpmaddwd %xmm28, %xmm24, %xmm17 {%k1} + +// CHECK: vpmaddwd %xmm28, %xmm24, %xmm17 {%k1} {z} +// CHECK: encoding: [0x62,0x81,0x3d,0x81,0xf5,0xcc] + vpmaddwd %xmm28, %xmm24, %xmm17 {%k1} {z} + +// CHECK: vpmaddwd (%rcx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x09] + vpmaddwd (%rcx), %xmm24, %xmm17 + +// CHECK: vpmaddwd 291(%rax,%r14,8), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xa1,0x3d,0x00,0xf5,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmaddwd 291(%rax,%r14,8), %xmm24, %xmm17 + +// CHECK: vpmaddwd 2032(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x4a,0x7f] + vpmaddwd 2032(%rdx), %xmm24, %xmm17 + +// CHECK: vpmaddwd 2048(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x8a,0x00,0x08,0x00,0x00] + vpmaddwd 2048(%rdx), %xmm24, %xmm17 + +// CHECK: vpmaddwd -2048(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x4a,0x80] + vpmaddwd -2048(%rdx), %xmm24, %xmm17 + +// CHECK: vpmaddwd -2064(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x8a,0xf0,0xf7,0xff,0xff] + vpmaddwd -2064(%rdx), %xmm24, %xmm17 + +// CHECK: vpmaddwd %ymm19, %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x21,0x45,0x20,0xf5,0xc3] + vpmaddwd %ymm19, %ymm23, %ymm24 + +// CHECK: vpmaddwd %ymm19, %ymm23, %ymm24 {%k4} +// CHECK: encoding: [0x62,0x21,0x45,0x24,0xf5,0xc3] + vpmaddwd %ymm19, %ymm23, %ymm24 {%k4} + +// CHECK: vpmaddwd %ymm19, %ymm23, %ymm24 {%k4} {z} +// CHECK: encoding: [0x62,0x21,0x45,0xa4,0xf5,0xc3] + vpmaddwd %ymm19, %ymm23, %ymm24 {%k4} {z} + +// CHECK: vpmaddwd (%rcx), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x01] + vpmaddwd (%rcx), %ymm23, %ymm24 + +// CHECK: vpmaddwd 291(%rax,%r14,8), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x21,0x45,0x20,0xf5,0x84,0xf0,0x23,0x01,0x00,0x00] + vpmaddwd 291(%rax,%r14,8), %ymm23, %ymm24 + +// CHECK: vpmaddwd 4064(%rdx), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x42,0x7f] + vpmaddwd 4064(%rdx), %ymm23, %ymm24 + +// CHECK: vpmaddwd 4096(%rdx), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x82,0x00,0x10,0x00,0x00] + vpmaddwd 4096(%rdx), %ymm23, %ymm24 + +// CHECK: vpmaddwd -4096(%rdx), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x42,0x80] + vpmaddwd -4096(%rdx), %ymm23, %ymm24 + +// CHECK: vpmaddwd -4128(%rdx), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x82,0xe0,0xef,0xff,0xff] + vpmaddwd -4128(%rdx), %ymm23, %ymm24 + +// CHECK: vpmaddubsw %xmm25, %xmm23, %xmm19 +// CHECK: encoding: [0x62,0x82,0x45,0x00,0x04,0xd9] + vpmaddubsw %xmm25, %xmm23, %xmm19 + +// CHECK: vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2} +// CHECK: encoding: [0x62,0x82,0x45,0x02,0x04,0xd9] + vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2} + +// CHECK: vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2} {z} +// CHECK: encoding: [0x62,0x82,0x45,0x82,0x04,0xd9] + vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2} {z} + +// CHECK: vpmaddubsw (%rcx), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x19] + vpmaddubsw (%rcx), %xmm23, %xmm19 + +// CHECK: vpmaddubsw 4660(%rax,%r14,8), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xa2,0x45,0x00,0x04,0x9c,0xf0,0x34,0x12,0x00,0x00] + vpmaddubsw 4660(%rax,%r14,8), %xmm23, %xmm19 + +// CHECK: vpmaddubsw 2032(%rdx), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x5a,0x7f] + vpmaddubsw 2032(%rdx), %xmm23, %xmm19 + +// CHECK: vpmaddubsw 2048(%rdx), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x9a,0x00,0x08,0x00,0x00] + vpmaddubsw 2048(%rdx), %xmm23, %xmm19 + +// CHECK: vpmaddubsw -2048(%rdx), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x5a,0x80] + vpmaddubsw -2048(%rdx), %xmm23, %xmm19 + +// CHECK: vpmaddubsw -2064(%rdx), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x9a,0xf0,0xf7,0xff,0xff] + vpmaddubsw -2064(%rdx), %xmm23, %xmm19 + +// CHECK: vpmaddubsw %ymm22, %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xa2,0x65,0x20,0x04,0xce] + vpmaddubsw %ymm22, %ymm19, %ymm17 + +// CHECK: vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7} +// CHECK: encoding: [0x62,0xa2,0x65,0x27,0x04,0xce] + vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7} + +// CHECK: vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7} {z} +// CHECK: encoding: [0x62,0xa2,0x65,0xa7,0x04,0xce] + vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7} {z} + +// CHECK: vpmaddubsw (%rcx), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x09] + vpmaddubsw (%rcx), %ymm19, %ymm17 + +// CHECK: vpmaddubsw 4660(%rax,%r14,8), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xa2,0x65,0x20,0x04,0x8c,0xf0,0x34,0x12,0x00,0x00] + vpmaddubsw 4660(%rax,%r14,8), %ymm19, %ymm17 + +// CHECK: vpmaddubsw 4064(%rdx), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x4a,0x7f] + vpmaddubsw 4064(%rdx), %ymm19, %ymm17 + +// CHECK: vpmaddubsw 4096(%rdx), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x8a,0x00,0x10,0x00,0x00] + vpmaddubsw 4096(%rdx), %ymm19, %ymm17 + +// CHECK: vpmaddubsw -4096(%rdx), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x4a,0x80] + vpmaddubsw -4096(%rdx), %ymm19, %ymm17 + +// CHECK: vpmaddubsw -4128(%rdx), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x8a,0xe0,0xef,0xff,0xff] + vpmaddubsw -4128(%rdx), %ymm19, %ymm17 + +// CHECK: vpmaddwd %xmm20, %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xa1,0x4d,0x00,0xf5,0xfc] + vpmaddwd %xmm20, %xmm22, %xmm23 + +// CHECK: vpmaddwd %xmm20, %xmm22, %xmm23 {%k3} +// CHECK: encoding: [0x62,0xa1,0x4d,0x03,0xf5,0xfc] + vpmaddwd %xmm20, %xmm22, %xmm23 {%k3} + +// CHECK: vpmaddwd %xmm20, %xmm22, %xmm23 {%k3} {z} +// CHECK: encoding: [0x62,0xa1,0x4d,0x83,0xf5,0xfc] + vpmaddwd %xmm20, %xmm22, %xmm23 {%k3} {z} + +// CHECK: vpmaddwd (%rcx), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0x39] + vpmaddwd (%rcx), %xmm22, %xmm23 + +// CHECK: vpmaddwd 4660(%rax,%r14,8), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xa1,0x4d,0x00,0xf5,0xbc,0xf0,0x34,0x12,0x00,0x00] + vpmaddwd 4660(%rax,%r14,8), %xmm22, %xmm23 + +// CHECK: vpmaddwd 2032(%rdx), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0x7a,0x7f] + vpmaddwd 2032(%rdx), %xmm22, %xmm23 + +// CHECK: vpmaddwd 2048(%rdx), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0xba,0x00,0x08,0x00,0x00] + vpmaddwd 2048(%rdx), %xmm22, %xmm23 + +// CHECK: vpmaddwd -2048(%rdx), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0x7a,0x80] + vpmaddwd -2048(%rdx), %xmm22, %xmm23 + +// CHECK: vpmaddwd -2064(%rdx), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0xba,0xf0,0xf7,0xff,0xff] + vpmaddwd -2064(%rdx), %xmm22, %xmm23 + +// CHECK: vpmaddwd %ymm17, %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xa1,0x5d,0x20,0xf5,0xd9] + vpmaddwd %ymm17, %ymm20, %ymm19 + +// CHECK: vpmaddwd %ymm17, %ymm20, %ymm19 {%k2} +// CHECK: encoding: [0x62,0xa1,0x5d,0x22,0xf5,0xd9] + vpmaddwd %ymm17, %ymm20, %ymm19 {%k2} + +// CHECK: vpmaddwd %ymm17, %ymm20, %ymm19 {%k2} {z} +// CHECK: encoding: [0x62,0xa1,0x5d,0xa2,0xf5,0xd9] + vpmaddwd %ymm17, %ymm20, %ymm19 {%k2} {z} + +// CHECK: vpmaddwd (%rcx), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x19] + vpmaddwd (%rcx), %ymm20, %ymm19 + +// CHECK: vpmaddwd 4660(%rax,%r14,8), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xa1,0x5d,0x20,0xf5,0x9c,0xf0,0x34,0x12,0x00,0x00] + vpmaddwd 4660(%rax,%r14,8), %ymm20, %ymm19 + +// CHECK: vpmaddwd 4064(%rdx), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x5a,0x7f] + vpmaddwd 4064(%rdx), %ymm20, %ymm19 + +// CHECK: vpmaddwd 4096(%rdx), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x9a,0x00,0x10,0x00,0x00] + vpmaddwd 4096(%rdx), %ymm20, %ymm19 + +// CHECK: vpmaddwd -4096(%rdx), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x5a,0x80] + vpmaddwd -4096(%rdx), %ymm20, %ymm19 + +// CHECK: vpmaddwd -4128(%rdx), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x9a,0xe0,0xef,0xff,0xff] + vpmaddwd -4128(%rdx), %ymm20, %ymm19 +