Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td @@ -4511,6 +4511,36 @@ def int_x86_avx512_mask_pavg_w_256 : GCCBuiltin<"__builtin_ia32_pavgw256_mask">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_mask_pmaddw_d_128 : + GCCBuiltin<"__builtin_ia32_pmaddwd128_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmaddw_d_256 : + GCCBuiltin<"__builtin_ia32_pmaddwd256_mask">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmaddw_d_512 : + GCCBuiltin<"__builtin_ia32_pmaddwd512_mask">, + Intrinsic<[llvm_v16i32_ty], + [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmaddubs_w_128 : + GCCBuiltin<"__builtin_ia32_pmaddubsw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmaddubs_w_256 : + GCCBuiltin<"__builtin_ia32_pmaddubsw256_mask">, + Intrinsic<[llvm_v16i16_ty], + [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmaddubs_w_512 : + GCCBuiltin<"__builtin_ia32_pmaddubsw512_mask">, + Intrinsic<[llvm_v32i16_ty], + [llvm_v64i8_ty, llvm_v64i8_ty, llvm_v32i16_ty, llvm_i32_ty], + [IntrNoMem]>; } // Gather and Scatter ops Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h @@ -403,7 +403,8 @@ PMULDQ, // Vector Multiply Packed UnsignedIntegers with Round and Scale MULHRS, - + // Multiply and Add Packed 
Integers + VPMADDUBSW, VPMADDWD, // FMA nodes FMADD, FNMADD, Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -19008,6 +19008,8 @@ case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; + case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -3229,11 +3229,30 @@ v16i8x_info>, EVEX_V128; } } + +multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo _Src, + AVX512VLVectorVTInfo _Dst> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512, + _Dst.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256, + _Dst.info256>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128, + _Dst.info128>, EVEX_V128; + } +} + let Predicates = [HasBWI] in { defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD; defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD; defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W; defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W; + + defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, + avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD; + defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, + avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase; } defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", smax, Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td 
=================================================================== --- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -288,6 +288,9 @@ def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; +def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>; +def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>; + def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -596,6 +596,18 @@ X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== 
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1038,3 +1038,36 @@ %res2 = add <32 x i16> %res, %res1 ret <32 x i16> %res2 } + +declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) + +define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) + %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1) + %res2 = add <32 x i16> %res, %res1 + ret <32 x i16> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + Index: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ 
llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -3843,3 +3843,70 @@ %res2 = add <16 x i16> %res, %res1 ret <16 x i16> %res2 } + +declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128: +; CHECK: ## BB#0: 
+; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1) + %res2 = add <8 x i16> %res, %res1 + ret <8 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1) + %res2 = add <16 x i16> %res, %res1 + ret <16 x i16> %res2 +} Index: llvm/trunk/test/MC/X86/x86-64-avx512bw.s =================================================================== --- llvm/trunk/test/MC/X86/x86-64-avx512bw.s +++ llvm/trunk/test/MC/X86/x86-64-avx512bw.s @@ -3776,3 +3776,75 @@ // CHECK: encoding: [0x62,0xe2,0x25,0x40,0x0b,0xaa,0xc0,0xdf,0xff,0xff] vpmulhrsw -8256(%rdx), %zmm27, %zmm21 +// CHECK: vpmaddubsw %zmm25, %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x02,0x5d,0x40,0x04,0xd9] + vpmaddubsw %zmm25, %zmm20, %zmm27 + +// CHECK: vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3} +// CHECK: encoding: [0x62,0x02,0x5d,0x43,0x04,0xd9] + vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3} + +// CHECK: vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3} {z} +// CHECK: encoding: 
[0x62,0x02,0x5d,0xc3,0x04,0xd9] + vpmaddubsw %zmm25, %zmm20, %zmm27 {%k3} {z} + +// CHECK: vpmaddubsw (%rcx), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x19] + vpmaddubsw (%rcx), %zmm20, %zmm27 + +// CHECK: vpmaddubsw 291(%rax,%r14,8), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x22,0x5d,0x40,0x04,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpmaddubsw 291(%rax,%r14,8), %zmm20, %zmm27 + +// CHECK: vpmaddubsw 8128(%rdx), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x5a,0x7f] + vpmaddubsw 8128(%rdx), %zmm20, %zmm27 + +// CHECK: vpmaddubsw 8192(%rdx), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x9a,0x00,0x20,0x00,0x00] + vpmaddubsw 8192(%rdx), %zmm20, %zmm27 + +// CHECK: vpmaddubsw -8192(%rdx), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x5a,0x80] + vpmaddubsw -8192(%rdx), %zmm20, %zmm27 + +// CHECK: vpmaddubsw -8256(%rdx), %zmm20, %zmm27 +// CHECK: encoding: [0x62,0x62,0x5d,0x40,0x04,0x9a,0xc0,0xdf,0xff,0xff] + vpmaddubsw -8256(%rdx), %zmm20, %zmm27 + +// CHECK: vpmaddwd %zmm25, %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x01,0x4d,0x40,0xf5,0xd1] + vpmaddwd %zmm25, %zmm22, %zmm26 + +// CHECK: vpmaddwd %zmm25, %zmm22, %zmm26 {%k2} +// CHECK: encoding: [0x62,0x01,0x4d,0x42,0xf5,0xd1] + vpmaddwd %zmm25, %zmm22, %zmm26 {%k2} + +// CHECK: vpmaddwd %zmm25, %zmm22, %zmm26 {%k2} {z} +// CHECK: encoding: [0x62,0x01,0x4d,0xc2,0xf5,0xd1] + vpmaddwd %zmm25, %zmm22, %zmm26 {%k2} {z} + +// CHECK: vpmaddwd (%rcx), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x11] + vpmaddwd (%rcx), %zmm22, %zmm26 + +// CHECK: vpmaddwd 291(%rax,%r14,8), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x21,0x4d,0x40,0xf5,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmaddwd 291(%rax,%r14,8), %zmm22, %zmm26 + +// CHECK: vpmaddwd 8128(%rdx), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x52,0x7f] + vpmaddwd 8128(%rdx), %zmm22, %zmm26 + +// CHECK: vpmaddwd 8192(%rdx), %zmm22, %zmm26 +// CHECK: encoding: 
[0x62,0x61,0x4d,0x40,0xf5,0x92,0x00,0x20,0x00,0x00] + vpmaddwd 8192(%rdx), %zmm22, %zmm26 + +// CHECK: vpmaddwd -8192(%rdx), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x52,0x80] + vpmaddwd -8192(%rdx), %zmm22, %zmm26 + +// CHECK: vpmaddwd -8256(%rdx), %zmm22, %zmm26 +// CHECK: encoding: [0x62,0x61,0x4d,0x40,0xf5,0x92,0xc0,0xdf,0xff,0xff] + vpmaddwd -8256(%rdx), %zmm22, %zmm26 + Index: llvm/trunk/test/MC/X86/x86-64-avx512bw_vl.s =================================================================== --- llvm/trunk/test/MC/X86/x86-64-avx512bw_vl.s +++ llvm/trunk/test/MC/X86/x86-64-avx512bw_vl.s @@ -6799,3 +6799,291 @@ // CHECK: encoding: [0x62,0x62,0x5d,0x20,0x0b,0xa2,0xe0,0xef,0xff,0xff] vpmulhrsw -4128(%rdx), %ymm20, %ymm28 +// CHECK: vpmaddubsw %xmm20, %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x22,0x55,0x00,0x04,0xe4] + vpmaddubsw %xmm20, %xmm21, %xmm28 + +// CHECK: vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6} +// CHECK: encoding: [0x62,0x22,0x55,0x06,0x04,0xe4] + vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6} + +// CHECK: vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6} {z} +// CHECK: encoding: [0x62,0x22,0x55,0x86,0x04,0xe4] + vpmaddubsw %xmm20, %xmm21, %xmm28 {%k6} {z} + +// CHECK: vpmaddubsw (%rcx), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0x21] + vpmaddubsw (%rcx), %xmm21, %xmm28 + +// CHECK: vpmaddubsw 291(%rax,%r14,8), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x22,0x55,0x00,0x04,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmaddubsw 291(%rax,%r14,8), %xmm21, %xmm28 + +// CHECK: vpmaddubsw 2032(%rdx), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0x62,0x7f] + vpmaddubsw 2032(%rdx), %xmm21, %xmm28 + +// CHECK: vpmaddubsw 2048(%rdx), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0xa2,0x00,0x08,0x00,0x00] + vpmaddubsw 2048(%rdx), %xmm21, %xmm28 + +// CHECK: vpmaddubsw -2048(%rdx), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0x62,0x80] + vpmaddubsw -2048(%rdx), %xmm21, %xmm28 + +// CHECK: vpmaddubsw 
-2064(%rdx), %xmm21, %xmm28 +// CHECK: encoding: [0x62,0x62,0x55,0x00,0x04,0xa2,0xf0,0xf7,0xff,0xff] + vpmaddubsw -2064(%rdx), %xmm21, %xmm28 + +// CHECK: vpmaddubsw %ymm26, %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x02,0x2d,0x20,0x04,0xf2] + vpmaddubsw %ymm26, %ymm26, %ymm30 + +// CHECK: vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5} +// CHECK: encoding: [0x62,0x02,0x2d,0x25,0x04,0xf2] + vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5} + +// CHECK: vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5} {z} +// CHECK: encoding: [0x62,0x02,0x2d,0xa5,0x04,0xf2] + vpmaddubsw %ymm26, %ymm26, %ymm30 {%k5} {z} + +// CHECK: vpmaddubsw (%rcx), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0x31] + vpmaddubsw (%rcx), %ymm26, %ymm30 + +// CHECK: vpmaddubsw 291(%rax,%r14,8), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x22,0x2d,0x20,0x04,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmaddubsw 291(%rax,%r14,8), %ymm26, %ymm30 + +// CHECK: vpmaddubsw 4064(%rdx), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0x72,0x7f] + vpmaddubsw 4064(%rdx), %ymm26, %ymm30 + +// CHECK: vpmaddubsw 4096(%rdx), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0xb2,0x00,0x10,0x00,0x00] + vpmaddubsw 4096(%rdx), %ymm26, %ymm30 + +// CHECK: vpmaddubsw -4096(%rdx), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0x72,0x80] + vpmaddubsw -4096(%rdx), %ymm26, %ymm30 + +// CHECK: vpmaddubsw -4128(%rdx), %ymm26, %ymm30 +// CHECK: encoding: [0x62,0x62,0x2d,0x20,0x04,0xb2,0xe0,0xef,0xff,0xff] + vpmaddubsw -4128(%rdx), %ymm26, %ymm30 + +// CHECK: vpmaddwd %xmm28, %xmm24, %xmm17 +// CHECK: encoding: [0x62,0x81,0x3d,0x00,0xf5,0xcc] + vpmaddwd %xmm28, %xmm24, %xmm17 + +// CHECK: vpmaddwd %xmm28, %xmm24, %xmm17 {%k1} +// CHECK: encoding: [0x62,0x81,0x3d,0x01,0xf5,0xcc] + vpmaddwd %xmm28, %xmm24, %xmm17 {%k1} + +// CHECK: vpmaddwd %xmm28, %xmm24, %xmm17 {%k1} {z} +// CHECK: encoding: [0x62,0x81,0x3d,0x81,0xf5,0xcc] + vpmaddwd %xmm28, %xmm24, %xmm17 {%k1} {z} + +// CHECK: vpmaddwd (%rcx), %xmm24, 
%xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x09] + vpmaddwd (%rcx), %xmm24, %xmm17 + +// CHECK: vpmaddwd 291(%rax,%r14,8), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xa1,0x3d,0x00,0xf5,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmaddwd 291(%rax,%r14,8), %xmm24, %xmm17 + +// CHECK: vpmaddwd 2032(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x4a,0x7f] + vpmaddwd 2032(%rdx), %xmm24, %xmm17 + +// CHECK: vpmaddwd 2048(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x8a,0x00,0x08,0x00,0x00] + vpmaddwd 2048(%rdx), %xmm24, %xmm17 + +// CHECK: vpmaddwd -2048(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x4a,0x80] + vpmaddwd -2048(%rdx), %xmm24, %xmm17 + +// CHECK: vpmaddwd -2064(%rdx), %xmm24, %xmm17 +// CHECK: encoding: [0x62,0xe1,0x3d,0x00,0xf5,0x8a,0xf0,0xf7,0xff,0xff] + vpmaddwd -2064(%rdx), %xmm24, %xmm17 + +// CHECK: vpmaddwd %ymm19, %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x21,0x45,0x20,0xf5,0xc3] + vpmaddwd %ymm19, %ymm23, %ymm24 + +// CHECK: vpmaddwd %ymm19, %ymm23, %ymm24 {%k4} +// CHECK: encoding: [0x62,0x21,0x45,0x24,0xf5,0xc3] + vpmaddwd %ymm19, %ymm23, %ymm24 {%k4} + +// CHECK: vpmaddwd %ymm19, %ymm23, %ymm24 {%k4} {z} +// CHECK: encoding: [0x62,0x21,0x45,0xa4,0xf5,0xc3] + vpmaddwd %ymm19, %ymm23, %ymm24 {%k4} {z} + +// CHECK: vpmaddwd (%rcx), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x01] + vpmaddwd (%rcx), %ymm23, %ymm24 + +// CHECK: vpmaddwd 291(%rax,%r14,8), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x21,0x45,0x20,0xf5,0x84,0xf0,0x23,0x01,0x00,0x00] + vpmaddwd 291(%rax,%r14,8), %ymm23, %ymm24 + +// CHECK: vpmaddwd 4064(%rdx), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x42,0x7f] + vpmaddwd 4064(%rdx), %ymm23, %ymm24 + +// CHECK: vpmaddwd 4096(%rdx), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x82,0x00,0x10,0x00,0x00] + vpmaddwd 4096(%rdx), %ymm23, %ymm24 + +// CHECK: vpmaddwd -4096(%rdx), %ymm23, %ymm24 +// CHECK: encoding: 
[0x62,0x61,0x45,0x20,0xf5,0x42,0x80] + vpmaddwd -4096(%rdx), %ymm23, %ymm24 + +// CHECK: vpmaddwd -4128(%rdx), %ymm23, %ymm24 +// CHECK: encoding: [0x62,0x61,0x45,0x20,0xf5,0x82,0xe0,0xef,0xff,0xff] + vpmaddwd -4128(%rdx), %ymm23, %ymm24 + +// CHECK: vpmaddubsw %xmm25, %xmm23, %xmm19 +// CHECK: encoding: [0x62,0x82,0x45,0x00,0x04,0xd9] + vpmaddubsw %xmm25, %xmm23, %xmm19 + +// CHECK: vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2} +// CHECK: encoding: [0x62,0x82,0x45,0x02,0x04,0xd9] + vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2} + +// CHECK: vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2} {z} +// CHECK: encoding: [0x62,0x82,0x45,0x82,0x04,0xd9] + vpmaddubsw %xmm25, %xmm23, %xmm19 {%k2} {z} + +// CHECK: vpmaddubsw (%rcx), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x19] + vpmaddubsw (%rcx), %xmm23, %xmm19 + +// CHECK: vpmaddubsw 4660(%rax,%r14,8), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xa2,0x45,0x00,0x04,0x9c,0xf0,0x34,0x12,0x00,0x00] + vpmaddubsw 4660(%rax,%r14,8), %xmm23, %xmm19 + +// CHECK: vpmaddubsw 2032(%rdx), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x5a,0x7f] + vpmaddubsw 2032(%rdx), %xmm23, %xmm19 + +// CHECK: vpmaddubsw 2048(%rdx), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x9a,0x00,0x08,0x00,0x00] + vpmaddubsw 2048(%rdx), %xmm23, %xmm19 + +// CHECK: vpmaddubsw -2048(%rdx), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x5a,0x80] + vpmaddubsw -2048(%rdx), %xmm23, %xmm19 + +// CHECK: vpmaddubsw -2064(%rdx), %xmm23, %xmm19 +// CHECK: encoding: [0x62,0xe2,0x45,0x00,0x04,0x9a,0xf0,0xf7,0xff,0xff] + vpmaddubsw -2064(%rdx), %xmm23, %xmm19 + +// CHECK: vpmaddubsw %ymm22, %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xa2,0x65,0x20,0x04,0xce] + vpmaddubsw %ymm22, %ymm19, %ymm17 + +// CHECK: vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7} +// CHECK: encoding: [0x62,0xa2,0x65,0x27,0x04,0xce] + vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7} + +// CHECK: vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7} {z} +// CHECK: encoding: 
[0x62,0xa2,0x65,0xa7,0x04,0xce] + vpmaddubsw %ymm22, %ymm19, %ymm17 {%k7} {z} + +// CHECK: vpmaddubsw (%rcx), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x09] + vpmaddubsw (%rcx), %ymm19, %ymm17 + +// CHECK: vpmaddubsw 4660(%rax,%r14,8), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xa2,0x65,0x20,0x04,0x8c,0xf0,0x34,0x12,0x00,0x00] + vpmaddubsw 4660(%rax,%r14,8), %ymm19, %ymm17 + +// CHECK: vpmaddubsw 4064(%rdx), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x4a,0x7f] + vpmaddubsw 4064(%rdx), %ymm19, %ymm17 + +// CHECK: vpmaddubsw 4096(%rdx), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x8a,0x00,0x10,0x00,0x00] + vpmaddubsw 4096(%rdx), %ymm19, %ymm17 + +// CHECK: vpmaddubsw -4096(%rdx), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x4a,0x80] + vpmaddubsw -4096(%rdx), %ymm19, %ymm17 + +// CHECK: vpmaddubsw -4128(%rdx), %ymm19, %ymm17 +// CHECK: encoding: [0x62,0xe2,0x65,0x20,0x04,0x8a,0xe0,0xef,0xff,0xff] + vpmaddubsw -4128(%rdx), %ymm19, %ymm17 + +// CHECK: vpmaddwd %xmm20, %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xa1,0x4d,0x00,0xf5,0xfc] + vpmaddwd %xmm20, %xmm22, %xmm23 + +// CHECK: vpmaddwd %xmm20, %xmm22, %xmm23 {%k3} +// CHECK: encoding: [0x62,0xa1,0x4d,0x03,0xf5,0xfc] + vpmaddwd %xmm20, %xmm22, %xmm23 {%k3} + +// CHECK: vpmaddwd %xmm20, %xmm22, %xmm23 {%k3} {z} +// CHECK: encoding: [0x62,0xa1,0x4d,0x83,0xf5,0xfc] + vpmaddwd %xmm20, %xmm22, %xmm23 {%k3} {z} + +// CHECK: vpmaddwd (%rcx), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0x39] + vpmaddwd (%rcx), %xmm22, %xmm23 + +// CHECK: vpmaddwd 4660(%rax,%r14,8), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xa1,0x4d,0x00,0xf5,0xbc,0xf0,0x34,0x12,0x00,0x00] + vpmaddwd 4660(%rax,%r14,8), %xmm22, %xmm23 + +// CHECK: vpmaddwd 2032(%rdx), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0x7a,0x7f] + vpmaddwd 2032(%rdx), %xmm22, %xmm23 + +// CHECK: vpmaddwd 2048(%rdx), %xmm22, %xmm23 +// CHECK: encoding: 
[0x62,0xe1,0x4d,0x00,0xf5,0xba,0x00,0x08,0x00,0x00] + vpmaddwd 2048(%rdx), %xmm22, %xmm23 + +// CHECK: vpmaddwd -2048(%rdx), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0x7a,0x80] + vpmaddwd -2048(%rdx), %xmm22, %xmm23 + +// CHECK: vpmaddwd -2064(%rdx), %xmm22, %xmm23 +// CHECK: encoding: [0x62,0xe1,0x4d,0x00,0xf5,0xba,0xf0,0xf7,0xff,0xff] + vpmaddwd -2064(%rdx), %xmm22, %xmm23 + +// CHECK: vpmaddwd %ymm17, %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xa1,0x5d,0x20,0xf5,0xd9] + vpmaddwd %ymm17, %ymm20, %ymm19 + +// CHECK: vpmaddwd %ymm17, %ymm20, %ymm19 {%k2} +// CHECK: encoding: [0x62,0xa1,0x5d,0x22,0xf5,0xd9] + vpmaddwd %ymm17, %ymm20, %ymm19 {%k2} + +// CHECK: vpmaddwd %ymm17, %ymm20, %ymm19 {%k2} {z} +// CHECK: encoding: [0x62,0xa1,0x5d,0xa2,0xf5,0xd9] + vpmaddwd %ymm17, %ymm20, %ymm19 {%k2} {z} + +// CHECK: vpmaddwd (%rcx), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x19] + vpmaddwd (%rcx), %ymm20, %ymm19 + +// CHECK: vpmaddwd 4660(%rax,%r14,8), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xa1,0x5d,0x20,0xf5,0x9c,0xf0,0x34,0x12,0x00,0x00] + vpmaddwd 4660(%rax,%r14,8), %ymm20, %ymm19 + +// CHECK: vpmaddwd 4064(%rdx), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x5a,0x7f] + vpmaddwd 4064(%rdx), %ymm20, %ymm19 + +// CHECK: vpmaddwd 4096(%rdx), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x9a,0x00,0x10,0x00,0x00] + vpmaddwd 4096(%rdx), %ymm20, %ymm19 + +// CHECK: vpmaddwd -4096(%rdx), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x5a,0x80] + vpmaddwd -4096(%rdx), %ymm20, %ymm19 + +// CHECK: vpmaddwd -4128(%rdx), %ymm20, %ymm19 +// CHECK: encoding: [0x62,0xe1,0x5d,0x20,0xf5,0x9a,0xe0,0xef,0xff,0xff] + vpmaddwd -4128(%rdx), %ymm20, %ymm19 +