Index: lib/Support/Host.cpp =================================================================== --- lib/Support/Host.cpp +++ lib/Support/Host.cpp @@ -1390,6 +1390,7 @@ Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1); // AVX512 is only supported if the OS supports the context save for it. + Features["avx512vpopcntdq"] = HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save; Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save; Features["avx512dq"] = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save; Features["avx512ifma"] = HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save; Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -127,6 +127,9 @@ def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", "Enable AVX-512 Conflict Detection Instructions", [FeatureAVX512]>; +def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ", + "true", "Enable AVX-512 Population Count Instructions", + [FeatureAVX512]>; def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", "Enable AVX-512 PreFetch Instructions", [FeatureAVX512]>; Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1373,6 +1373,14 @@ setOperationAction(ISD::MUL, MVT::v8i64, Legal); } + if (Subtarget.hasVPOPCNTDQ()) { + // VPOPCNTDQ sub-targets widen 128/256-bit vectors to use the AVX-512 + // version of popcntd/q. + for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64, + MVT::v4i32, MVT::v2i64}) + setOperationAction(ISD::CTPOP, VT, Legal); + } + // Custom lower several nodes. for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -8679,6 +8679,42 @@ } //===---------------------------------------------------------------------===// +// Counts number of ones - VPOPCNTD and VPOPCNTQ +//===---------------------------------------------------------------------===// + +multiclass avx512_unary_rmb_popcnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo VTInfo> { + let Predicates = [HasVPOPCNTDQ] in + defm Z : avx512_unary_rmb<opc, OpcodeStr, ctpop, VTInfo>, EVEX_V512; +} + +// Use the 512-bit version to implement 128/256-bit operations.
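+// The Z256_Alt/Z128_Alt patterns below widen the 128/256-bit source into a +// 512-bit register with INSERT_SUBREG (the upper lanes start out undef via +// IMPLICIT_DEF), run the 512-bit "Zrr" form of the instruction, and then +// recover the original width with EXTRACT_SUBREG.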
+multiclass avx512_unary_lowering<SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info256.RC:$src1, + _.info256.SubRegIdx)), + _.info256.SubRegIdx)>; + + def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info128.RC:$src1, + _.info128.SubRegIdx)), + _.info128.SubRegIdx)>; + } +} + +defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", v16i32_info>, + avx512_unary_lowering<ctpop, avx512vl_i32_info, HasVPOPCNTDQ>; +defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", v8i64_info>, + avx512_unary_lowering<ctpop, avx512vl_i64_info, HasVPOPCNTDQ>, VEX_W; + +//===---------------------------------------------------------------------===// // Replicate Single FP - MOVSHDUP and MOVSLDUP //===---------------------------------------------------------------------===// multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{ Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -877,6 +877,8 @@ { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, // AVX-512 foldable instructions + { X86::VPOPCNTDZrr, X86::VPOPCNTDZrm, 0 }, + { X86::VPOPCNTQZrr, X86::VPOPCNTQZrm, 0 }, { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, @@ -2304,6 +2306,8 @@ { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 }, // AVX-512 masked foldable instructions + { X86::VPOPCNTDZrrkz, X86::VPOPCNTDZrmkz, 0 }, + { X86::VPOPCNTQZrrkz, X86::VPOPCNTQZrmkz, 0 }, { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 }, @@ -2925,6 +2929,8 @@ { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 }, // AVX-512 masked foldable instructions + { X86::VPOPCNTDZrrk, X86::VPOPCNTDZrmk, 0 }, + { X86::VPOPCNTQZrrk, X86::VPOPCNTQZrmk, 0 }, { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 }, Index: lib/Target/X86/X86InstrInfo.td =================================================================== --- lib/Target/X86/X86InstrInfo.td +++ lib/Target/X86/X86InstrInfo.td @@ -813,6 +813,8 @@ def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; def HasCDI : Predicate<"Subtarget->hasCDI()">, AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">; +def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">, + AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">; def HasPFI : Predicate<"Subtarget->hasPFI()">, AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">; def HasERI : Predicate<"Subtarget->hasERI()">, Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -265,6 +265,9 @@ /// Processor has AVX-512 Conflict Detection Instructions bool HasCDI; + /// Processor has AVX-512 Population Count Instructions + bool HasVPOPCNTDQ; + /// Processor has AVX-512 Doubleword and Quadword instructions bool HasDQI; @@ -492,6 +495,7 @@ bool slowLEA() const { return SlowLEA; } bool slowIncDec() const { return SlowIncDec; } bool hasCDI() const { return HasCDI; } + bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; } bool hasPFI() const { return
HasPFI; } bool hasERI() const { return HasERI; } bool hasDQI() const { return HasDQI; } Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -286,6 +286,7 @@ HasCDI = false; HasPFI = false; HasDQI = false; + HasVPOPCNTDQ = false; HasBWI = false; HasVLX = false; HasADX = false; Index: test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll +++ test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; The following tests check that patterns combining the ;; +;; ctpop intrinsic with a select are translated correctly ;; +;; to the masked vpopcntd/q instructions. ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define <16 x i32> @test_mask_vpopcnt_d(<16 x i32> %a, i16 %mask, <16 x i32> %b) { +; CHECK-LABEL: test_mask_vpopcnt_d: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpopcntd %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %a + ret <16 x i32> %3 +} + +define <16 x i32> @test_maskz_vpopcnt_d(i16 %mask, <16 x i32> %a) { +; CHECK-LABEL: test_maskz_vpopcnt_d: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +define <8 x i64> @test_mask_vpopcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_vpopcnt_q: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpopcntq %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq + %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b + ret <8 x i64> %3 +} + +define <8 x i64> @test_maskz_vpopcnt_q(<8 x i64> %a, i8 %mask) { +; CHECK-LABEL: test_maskz_vpopcnt_q: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) +declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) Index: test/CodeGen/X86/vector-popcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-popcnt-128.ll +++ test/CodeGen/X86/vector-popcnt-128.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s
-mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: @@ -81,19 +82,41 @@ ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv2i64: -; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in) ret <2 x i64> %out } @@ -193,23 +216,49 @@ ; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv4i32: -; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, 
%xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in) ret <4 x i32> %out } Index: test/CodeGen/X86/vector-popcnt-256.ll =================================================================== --- test/CodeGen/X86/vector-popcnt-256.ll +++ test/CodeGen/X86/vector-popcnt-256.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: @@ -39,6 +40,13 @@ ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv4i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) ret <4 x i64> %out } @@ -92,6 +100,13 @@ ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) ret <8 x i32> %out } @@ -137,6 +152,21 @@ ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; 
AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) ret <16 x i16> %out } @@ -173,6 +203,18 @@ ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) ret <32 x i8> %out } Index: test/CodeGen/X86/vector-popcnt-512.ll =================================================================== --- test/CodeGen/X86/vector-popcnt-512.ll +++ test/CodeGen/X86/vector-popcnt-512.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512F-LABEL: testv8i64: @@ -39,6 +40,11 @@ ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in) ret <8 x i64> %out } @@ -92,6 +98,11 @@ ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in) ret <16 x i32> %out } @@ -135,6 +146,30 @@ ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; 
AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in) ret <32 x i16> %out } @@ -169,6 +204,24 @@ ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in) ret <64 x i8> %out } Index: test/CodeGen/X86/vector-tzcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-tzcnt-128.ll +++ test/CodeGen/X86/vector-tzcnt-128.ll @@ -7,6 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ ; ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt. 
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41 @@ -117,6 +118,17 @@ ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -284,6 +296,17 @@ ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv2i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv2i64u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -501,6 +524,18 @@ ; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -700,6 +735,18 @@ ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv4i32u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -843,6 +890,25 @@ ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, 
%xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -984,6 +1050,25 @@ ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1106,6 +1191,22 @@ ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1224,6 +1325,22 @@ ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv16i8u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1258,6 +1375,12 @@ ; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: movl $8, %eax +; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv2i64: ; 
X32-SSE: # BB#0: ; X32-SSE-NEXT: movl $8, %eax @@ -1280,6 +1403,12 @@ ; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv2i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: movl $8, %eax +; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv2i64u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movl $8, %eax @@ -1300,6 +1429,11 @@ ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] @@ -1319,6 +1453,11 @@ ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv4i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv4i32u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] @@ -1338,6 +1477,11 @@ ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv8i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] @@ -1357,6 +1501,11 @@ ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv8i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv8i16u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] @@ -1376,6 +1525,11 @@ ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv16i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] @@ -1395,6 +1549,11 @@ ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv16i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv16i8u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] Index: test/CodeGen/X86/vector-tzcnt-256.ll =================================================================== --- test/CodeGen/X86/vector-tzcnt-256.ll +++ test/CodeGen/X86/vector-tzcnt-256.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ ; ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt. 
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 @@ -92,6 +93,17 @@ ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv4i64: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -182,6 +194,17 @@ ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv4i64u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -307,6 +330,17 @@ ; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv8i32: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -414,6 +448,17 @@ ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv8i32u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -532,6 +577,25 @@ ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: 
vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv16i16: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -647,6 +711,25 @@ ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv16i16u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -747,6 +830,22 @@ ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv32i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv32i8: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -844,6 +943,22 @@ ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv32i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, 
%ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv32i8u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 Index: test/CodeGen/X86/vector-tzcnt-512.ll =================================================================== --- test/CodeGen/X86/vector-tzcnt-512.ll +++ test/CodeGen/X86/vector-tzcnt-512.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CD-LABEL: testv8i64: @@ -64,6 +65,15 @@ ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0) ret <8 x i64> %out } @@ -105,6 +115,15 @@ ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1) ret <8 x i64> %out } @@ -186,6 +205,15 @@ ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0) ret <16 x i32> %out } @@ -231,6 +259,15 @@ ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1) ret <16 x i32> %out } @@ -305,6 +342,38 @@ ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor 
%ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0) ret <32 x i16> %out } @@ -379,6 +448,38 @@ ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; 
AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1) ret <32 x i16> %out } @@ -441,6 +542,32 @@ ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0) ret <64 x i8> %out } @@ -503,6 +630,32 @@ ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1) ret <64 x i8> %out } Index: test/MC/Disassembler/X86/avx-512.txt 
=================================================================== --- test/MC/Disassembler/X86/avx-512.txt +++ test/MC/Disassembler/X86/avx-512.txt @@ -1,5 +1,6 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s # RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mcpu=skx | FileCheck --check-prefix=CHECK-SKX %s +# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=AVX512VPOPCNTDQ # CHECK: vpbroadcastd %xmm18, %zmm28 {%k7} {z} 0x62 0x22 0x7d 0xcf 0x58 0xe2 @@ -265,3 +266,25 @@ # CHECK: vscatterqpd %ymm19, 256(%r9,%ymm31) {%k1} 0x62 0x82 0xfd 0x21 0xa3 0x5c 0x39 0x20 + +##################################################### +# POPULATION COUNT # +##################################################### + +# AVX512VPOPCNTDQ: vpopcntd %zmm21, %zmm26 {%k4} +0x62 0x22 0x7d 0x4c 0x55 0xd5 + +# AVX512VPOPCNTDQ: vpopcntd %zmm21, %zmm26 {%k4} {z} +0x62 0x22 0x7d 0xcc 0x55 0xd5 + +# AVX512VPOPCNTDQ: vpopcntd (%rcx), %zmm26 +0x62 0x62 0x7d 0x48 0x55 0x11 + +# AVX512VPOPCNTDQ: vpopcntq %zmm21, %zmm17 {%k6} +0x62 0xa2 0xfd 0x4e 0x55 0xcd + +# AVX512VPOPCNTDQ: vpopcntq %zmm21, %zmm17 {%k6} {z} +0x62 0xa2 0xfd 0xce 0x55 0xcd + +# AVX512VPOPCNTDQ: vpopcntq (%rcx), %zmm17 +0x62 0xe2 0xfd 0x48 0x55 0x09 Index: test/MC/X86/x86-64-avx512vpopcntdq.s =================================================================== --- test/MC/X86/x86-64-avx512vpopcntdq.s +++ test/MC/X86/x86-64-avx512vpopcntdq.s @@ -0,0 +1,225 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avx512vpopcntdq --show-encoding %s | FileCheck %s + +// CHECK: vpopcntq %zmm25, %zmm20 +// CHECK: encoding: [0x62,0x82,0xfd,0x48,0x55,0xe1] + vpopcntq %zmm25, %zmm20 + +// CHECK: vpopcntq %zmm25, %zmm20 {%k6} +// CHECK: encoding: [0x62,0x82,0xfd,0x4e,0x55,0xe1] + vpopcntq %zmm25, %zmm20 {%k6} + +// CHECK: vpopcntq %zmm25, %zmm20 {%k6} {z} +// CHECK: encoding: [0x62,0x82,0xfd,0xce,0x55,0xe1] + vpopcntq %zmm25, %zmm20 {%k6} {z} + +// CHECK: vpopcntq (%rcx), %zmm20 +// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x21] + vpopcntq (%rcx), %zmm20 + +// CHECK: vpopcntq 291(%rax,%r14,8), %zmm20 +// CHECK: encoding: [0x62,0xa2,0xfd,0x48,0x55,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpopcntq 291(%rax,%r14,8), %zmm20 + +// CHECK: vpopcntq (%rcx){1to8}, %zmm20 +// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x21] + vpopcntq (%rcx){1to8}, %zmm20 + +// CHECK: vpopcntq 4064(%rdx), %zmm20 +// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0xa2,0xe0,0x0f,0x00,0x00] + vpopcntq 4064(%rdx), %zmm20 + +// CHECK: vpopcntq 4096(%rdx), %zmm20 +// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x62,0x40] + vpopcntq 4096(%rdx), %zmm20 + +// CHECK: vpopcntq -4096(%rdx), %zmm20 +// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x62,0xc0] + vpopcntq -4096(%rdx), %zmm20 + +// CHECK: vpopcntq -4128(%rdx), %zmm20 +// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0xa2,0xe0,0xef,0xff,0xff] + vpopcntq -4128(%rdx), %zmm20 + +// CHECK: vpopcntq 1016(%rdx){1to8}, %zmm20 +// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x62,0x7f] + vpopcntq 1016(%rdx){1to8}, %zmm20 + +// CHECK: vpopcntq 1024(%rdx){1to8}, %zmm20 +// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0xa2,0x00,0x04,0x00,0x00] + vpopcntq 1024(%rdx){1to8}, %zmm20 + +// CHECK: vpopcntq -1024(%rdx){1to8}, %zmm20 +// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x62,0x80] + vpopcntq -1024(%rdx){1to8}, %zmm20 + +// CHECK: vpopcntq -1032(%rdx){1to8}, %zmm20 +// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0xa2,0xf8,0xfb,0xff,0xff] + vpopcntq 
-1032(%rdx){1to8}, %zmm20 + +// CHECK: vpopcntq %zmm21, %zmm17 +// CHECK: encoding: [0x62,0xa2,0xfd,0x48,0x55,0xcd] + vpopcntq %zmm21, %zmm17 + +// CHECK: vpopcntq %zmm21, %zmm17 {%k6} +// CHECK: encoding: [0x62,0xa2,0xfd,0x4e,0x55,0xcd] + vpopcntq %zmm21, %zmm17 {%k6} + +// CHECK: vpopcntq %zmm21, %zmm17 {%k6} {z} +// CHECK: encoding: [0x62,0xa2,0xfd,0xce,0x55,0xcd] + vpopcntq %zmm21, %zmm17 {%k6} {z} + +// CHECK: vpopcntq (%rcx), %zmm17 +// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x09] + vpopcntq (%rcx), %zmm17 + +// CHECK: vpopcntq 4660(%rax,%r14,8), %zmm17 +// CHECK: encoding: [0x62,0xa2,0xfd,0x48,0x55,0x8c,0xf0,0x34,0x12,0x00,0x00] + vpopcntq 4660(%rax,%r14,8), %zmm17 + +// CHECK: vpopcntq (%rcx){1to8}, %zmm17 +// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x09] + vpopcntq (%rcx){1to8}, %zmm17 + +// CHECK: vpopcntq 4064(%rdx), %zmm17 +// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x8a,0xe0,0x0f,0x00,0x00] + vpopcntq 4064(%rdx), %zmm17 + +// CHECK: vpopcntq 4096(%rdx), %zmm17 +// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x4a,0x40] + vpopcntq 4096(%rdx), %zmm17 + +// CHECK: vpopcntq -4096(%rdx), %zmm17 +// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x4a,0xc0] + vpopcntq -4096(%rdx), %zmm17 + +// CHECK: vpopcntq -4128(%rdx), %zmm17 +// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x8a,0xe0,0xef,0xff,0xff] + vpopcntq -4128(%rdx), %zmm17 + +// CHECK: vpopcntq 1016(%rdx){1to8}, %zmm17 +// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x4a,0x7f] + vpopcntq 1016(%rdx){1to8}, %zmm17 + +// CHECK: vpopcntq 1024(%rdx){1to8}, %zmm17 +// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x8a,0x00,0x04,0x00,0x00] + vpopcntq 1024(%rdx){1to8}, %zmm17 + +// CHECK: vpopcntq -1024(%rdx){1to8}, %zmm17 +// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x4a,0x80] + vpopcntq -1024(%rdx){1to8}, %zmm17 + +// CHECK: vpopcntq -1032(%rdx){1to8}, %zmm17 +// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x8a,0xf8,0xfb,0xff,0xff] + vpopcntq -1032(%rdx){1to8}, %zmm17 + +// CHECK: vpopcntd %zmm19, %zmm25 +// CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0xcb] + vpopcntd %zmm19, %zmm25 + +// CHECK: vpopcntd %zmm19, %zmm25 {%k4} +// CHECK: encoding: [0x62,0x22,0x7d,0x4c,0x55,0xcb] + vpopcntd %zmm19, %zmm25 {%k4} + +// CHECK: vpopcntd %zmm19, %zmm25 {%k4} {z} +// CHECK: encoding: [0x62,0x22,0x7d,0xcc,0x55,0xcb] + vpopcntd %zmm19, %zmm25 {%k4} {z} + +// CHECK: vpopcntd (%rcx), %zmm25 +// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x09] + vpopcntd (%rcx), %zmm25 + +// CHECK: vpopcntd 291(%rax,%r14,8), %zmm25 +// CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpopcntd 291(%rax,%r14,8), %zmm25 + +// CHECK: vpopcntd (%rcx){1to16}, %zmm25 +// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x09] + vpopcntd (%rcx){1to16}, %zmm25 + +// CHECK: vpopcntd 4064(%rdx), %zmm25 +// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x8a,0xe0,0x0f,0x00,0x00] + vpopcntd 4064(%rdx), %zmm25 + +// CHECK: vpopcntd 4096(%rdx), %zmm25 +// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x4a,0x40] + vpopcntd 4096(%rdx), %zmm25 + +// CHECK: vpopcntd -4096(%rdx), %zmm25 +// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x4a,0xc0] + vpopcntd -4096(%rdx), %zmm25 + +// CHECK: vpopcntd -4128(%rdx), %zmm25 +// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x8a,0xe0,0xef,0xff,0xff] + vpopcntd -4128(%rdx), %zmm25 + +// CHECK: vpopcntd 508(%rdx){1to16}, %zmm25 +// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x4a,0x7f] + vpopcntd 508(%rdx){1to16}, %zmm25 + +// CHECK: vpopcntd 512(%rdx){1to16}, %zmm25 +// CHECK: encoding: 
[0x62,0x62,0x7d,0x58,0x55,0x8a,0x00,0x02,0x00,0x00] + vpopcntd 512(%rdx){1to16}, %zmm25 + +// CHECK: vpopcntd -512(%rdx){1to16}, %zmm25 +// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x4a,0x80] + vpopcntd -512(%rdx){1to16}, %zmm25 + +// CHECK: vpopcntd -516(%rdx){1to16}, %zmm25 +// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x8a,0xfc,0xfd,0xff,0xff] + vpopcntd -516(%rdx){1to16}, %zmm25 + +// CHECK: vpopcntd %zmm21, %zmm26 +// CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0xd5] + vpopcntd %zmm21, %zmm26 + +// CHECK: vpopcntd %zmm21, %zmm26 {%k4} +// CHECK: encoding: [0x62,0x22,0x7d,0x4c,0x55,0xd5] + vpopcntd %zmm21, %zmm26 {%k4} + +// CHECK: vpopcntd %zmm21, %zmm26 {%k4} {z} +// CHECK: encoding: [0x62,0x22,0x7d,0xcc,0x55,0xd5] + vpopcntd %zmm21, %zmm26 {%k4} {z} + +// CHECK: vpopcntd (%rcx), %zmm26 +// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x11] + vpopcntd (%rcx), %zmm26 + +// CHECK: vpopcntd 4660(%rax,%r14,8), %zmm26 +// CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0x94,0xf0,0x34,0x12,0x00,0x00] + vpopcntd 4660(%rax,%r14,8), %zmm26 + +// CHECK: vpopcntd (%rcx){1to16}, %zmm26 +// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x11] + vpopcntd (%rcx){1to16}, %zmm26 + +// CHECK: vpopcntd 4064(%rdx), %zmm26 +// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x92,0xe0,0x0f,0x00,0x00] + vpopcntd 4064(%rdx), %zmm26 + +// CHECK: vpopcntd 4096(%rdx), %zmm26 +// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x52,0x40] + vpopcntd 4096(%rdx), %zmm26 + +// CHECK: vpopcntd -4096(%rdx), %zmm26 +// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x52,0xc0] + vpopcntd -4096(%rdx), %zmm26 + +// CHECK: vpopcntd -4128(%rdx), %zmm26 +// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x92,0xe0,0xef,0xff,0xff] + vpopcntd -4128(%rdx), %zmm26 + +// CHECK: vpopcntd 508(%rdx){1to16}, %zmm26 +// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x52,0x7f] + vpopcntd 508(%rdx){1to16}, %zmm26 + +// CHECK: vpopcntd 512(%rdx){1to16}, %zmm26 +// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x92,0x00,0x02,0x00,0x00] + vpopcntd 512(%rdx){1to16}, %zmm26 + +// CHECK: vpopcntd -512(%rdx){1to16}, %zmm26 +// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x52,0x80] + vpopcntd -512(%rdx){1to16}, %zmm26 + +// CHECK: vpopcntd -516(%rdx){1to16}, %zmm26 +// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x92,0xfc,0xfd,0xff,0xff] + vpopcntd -516(%rdx){1to16}, %zmm26
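Editor's note: a minimal stand-alone sketch (not part of the patch) of the runtime detection idiom that the lib/Support/Host.cpp hunk above encodes: CPUID leaf 7 reports the AVX512_VPOPCNTDQ bit in ECX[14], and XGETBV must confirm that the OS saves the AVX-512 register state (the HasAVX512Save guard). The helper name hasAVX512VPOPCNTDQ is illustrative; only the __get_cpuid/__get_cpuid_count helpers from GCC/Clang's cpuid.h and the xgetbv instruction are assumed.

#include <cstdint>
#include <cpuid.h> // GCC/Clang CPUID helpers

// Illustrative re-statement of the checks behind Host.cpp's
// Features["avx512vpopcntdq"] line; not taken from the patch itself.
static bool hasAVX512VPOPCNTDQ() {
  unsigned EAX, EBX, ECX, EDX;
  // CPUID.1:ECX[27] (OSXSAVE) must be set before XGETBV may be executed.
  if (!__get_cpuid(1, &EAX, &EBX, &ECX, &EDX) || !((ECX >> 27) & 1))
    return false;
  // XGETBV(0): the OS must save opmask and upper-ZMM state (XCR0 bits 5-7).
  uint32_t XCR0Lo, XCR0Hi;
  __asm__("xgetbv" : "=a"(XCR0Lo), "=d"(XCR0Hi) : "c"(0));
  bool HasAVX512Save = (XCR0Lo & 0xE0) == 0xE0;
  // CPUID.(EAX=7,ECX=0):ECX[14] is the AVX512_VPOPCNTDQ feature bit.
  if (!__get_cpuid_count(7, 0, &EAX, &EBX, &ECX, &EDX))
    return false;
  return ((ECX >> 14) & 1) && HasAVX512Save;
}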