diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9496,16 +9496,14 @@
   // But override that restriction when optimizing for size.
   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
+    assert(Subtarget.hasAVX() && "Not expecting to get here without AVX.");
     EVT CVT = Ld.getValueType();
     assert(!CVT.isVector() && "Must not broadcast a vector type");
-    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
-    // For size optimization, also splat v2f64 and v2i64, and for size opt
-    // with AVX2, also splat i8 and i16.
-    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
-    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
-        (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
-        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
+    if (ScalarSize == 32 ||
+        (ScalarSize == 64 && (IsGE256 || Subtarget.hasAVX2() || OptForSize)) ||
+        ((ScalarSize == 8 || ScalarSize == 16) && Subtarget.hasAVX2()) ||
+        (CVT.isFloatingPoint() && ScalarSize == 16 && Subtarget.hasFP16())) {
       const Constant *C = nullptr;
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
         C = CI->getConstantIntValue();
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -2756,7 +2756,8 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42]
+; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2764,7 +2765,8 @@
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42]
+; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 %i = and <8 x i32> %a,
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -148,7 +148,8 @@
 ; X32-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X32-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X32-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; X32-NEXT: vzeroupper
@@ -159,7 +160,8 @@
 ; X64-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X64-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vzeroupper
@@ -174,7 +176,7 @@
 ; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X32-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; X32-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -188,7 +190,7 @@
 ; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X64-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; X64-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1507,18 +1507,18 @@
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrlv_q_const:
 ; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,4]
-; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,4]
+; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A]
+; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A]
 ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT: retq # encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_const:
 ; X64-AVX512VL: # %bb.0:
-; X64-AVX512VL-NEXT: vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [4,4]
-; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [4,4]
+; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A]
 ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx2-shift.ll b/llvm/test/CodeGen/X86/avx2-shift.ll
--- a/llvm/test/CodeGen/X86/avx2-shift.ll
+++ b/llvm/test/CodeGen/X86/avx2-shift.ll
@@ -424,13 +424,15 @@
 ; X32-LABEL: shl9:
 ; X32: # %bb.0:
 ; X32-NEXT: vpsllw $3, %ymm0, %ymm0
-; X32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X32-NEXT: vpbroadcastb {{.*#+}} ymm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; X32-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: shl9:
 ; X64: # %bb.0:
 ; X64-NEXT: vpsllw $3, %ymm0, %ymm0
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vpbroadcastb {{.*#+}} ymm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; X64-NEXT: retq
 %B = shl <32 x i8> %A,
 ret <32 x i8> %B
@@ -440,13 +442,15 @@
 ; X32-LABEL: shr9:
 ; X32: # %bb.0:
 ; X32-NEXT: vpsrlw $3, %ymm0, %ymm0
-; X32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X32-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X32-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: shr9:
 ; X64: # %bb.0:
 ; X64-NEXT: vpsrlw $3, %ymm0, %ymm0
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; X64-NEXT: retq
 %B = lshr <32 x i8> %A,
 ret <32 x i8> %B
@@ -472,8 +476,9 @@
 ; X32-LABEL: sra_v32i8:
 ; X32: # %bb.0:
 ; X32-NEXT: vpsrlw $3, %ymm0, %ymm0
-; X32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; X32-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; X32-NEXT: vpsubb %ymm1, %ymm0, %ymm0
 ; X32-NEXT: retl
@@ -481,8 +486,9 @@
 ; X64-LABEL: sra_v32i8:
 ; X64: # %bb.0:
 ; X64-NEXT: vpsrlw $3, %ymm0, %ymm0
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X64-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X64-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; X64-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0
 ; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -657,12 +657,12 @@
 define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: _e4:
 ; X32: ## %bb.0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u>
+; X32-NEXT: vpbroadcastb {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: _e4:
 ; X64: ## %bb.0:
-; X64-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u>
+; X64-NEXT: vpbroadcastb {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52]
 ; X64-NEXT: retq
 %vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0
 %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
--- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -506,12 +506,14 @@
 define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
 ; X86-LABEL: shl_32i8:
 ; X86: # %bb.0:
-; X86-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X86-NEXT: vpsllw $4, %ymm0, %ymm2
-; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
+; X86-NEXT: vpbroadcastb {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; X86-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X86-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X86-NEXT: vpsllw $2, %ymm0, %ymm2
-; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
+; X86-NEXT: vpbroadcastb {{.*#+}} ymm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; X86-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X86-NEXT: vpaddb %ymm0, %ymm0, %ymm2
@@ -521,12 +523,14 @@
 ;
 ; X64-LABEL: shl_32i8:
 ; X64: # %bb.0:
-; X64-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X64-NEXT: vpsllw $4, %ymm0, %ymm2
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; X64-NEXT: vpbroadcastb {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X64-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X64-NEXT: vpsllw $2, %ymm0, %ymm2
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; X64-NEXT: vpbroadcastb {{.*#+}} ymm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X64-NEXT: vpaddb %ymm0, %ymm0, %ymm2
@@ -712,32 +716,38 @@
 define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
 ; X86-LABEL: lshr_32i8:
 ; X86: # %bb.0:
-; X86-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X86-NEXT: vpsrlw $4, %ymm0, %ymm2
-; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
+; X86-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X86-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X86-NEXT: vpsrlw $2, %ymm0, %ymm2
-; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
+; X86-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; X86-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X86-NEXT: vpsrlw $1, %ymm0, %ymm2
-; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
+; X86-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: lshr_32i8:
 ; X64: # %bb.0:
-; X64-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X64-NEXT: vpsrlw $4, %ymm0, %ymm2
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; X64-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X64-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X64-NEXT: vpsrlw $2, %ymm0, %ymm2
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; X64-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X64-NEXT: vpsrlw $1, %ymm0, %ymm2
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; X64-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-any_extend_load.ll b/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
--- a/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
+++ b/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
@@ -51,7 +51,8 @@
 ; KNL-LABEL: any_extend_load_v8i16:
 ; KNL: # %bb.0:
 ; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; KNL-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; KNL-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; KNL-NEXT: vmovq %xmm0, (%rdi)
 ; KNL-NEXT: retq
@@ -59,7 +60,8 @@
 ; SKX-LABEL: any_extend_load_v8i16:
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SKX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; SKX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; SKX-NEXT: vpmovwb %xmm0, (%rdi)
 ; SKX-NEXT: retq
 %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -293,7 +293,7 @@
 define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
 ; AVX512F-LABEL: imulq128_bcast:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
 ; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
 ; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
@@ -303,7 +303,7 @@
 ;
 ; AVX512VL-LABEL: imulq128_bcast:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
 ; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
@@ -313,7 +313,7 @@
 ;
 ; AVX512BW-LABEL: imulq128_bcast:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
@@ -324,15 +324,14 @@
 ; AVX512DQ-LABEL: imulq128_bcast:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086]
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; SKX-LABEL: imulq128_bcast:
 ; SKX: # %bb.0:
-; SKX-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; SKX-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; SKX-NEXT: retq
 %z = mul <2 x i64> %x,
 ret <2 x i64>%z
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
--- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
@@ -177,7 +177,7 @@
 ; CHECK-LABEL: bcast_unfold_add_v2i64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB5_1: # %bb2
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -584,7 +584,8 @@
 ; CHECK-LABEL: bcast_unfold_or_v2i64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [3,3]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3,3]
+; CHECK-NEXT: # xmm0 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB17_1: # %bb2
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -786,7 +787,8 @@
 ; CHECK-LABEL: bcast_unfold_fneg_v2f64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB23_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1003,7 +1005,8 @@
 ; CHECK-LABEL: bcast_unfold_fabs_v2f64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [NaN,NaN]
+; CHECK-NEXT: # xmm0 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB29_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1208,7 +1211,8 @@
 ; CHECK-LABEL: bcast_unfold_fadd_v2f64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB35_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1410,7 +1414,8 @@
 ; CHECK-LABEL: bcast_unfold_fmul_v2f64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB41_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1617,7 +1622,8 @@
 ; CHECK-LABEL: bcast_unfold_fdiv_v2f64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB47_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1865,7 +1871,8 @@
 ; CHECK-LABEL: bcast_unfold_fma213_v2f64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB54_1: # %bb2
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -1900,7 +1907,8 @@
 ; CHECK-LABEL: bcast_unfold_fma231_v2f64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB55_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2186,7 +2194,8 @@
 ; CHECK-LABEL: bcast_unfold_fmax_v2f64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB63_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2400,7 +2409,8 @@
 ; CHECK-LABEL: bcast_unfold_fmin_v2f64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB69_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2611,7 +2621,7 @@
 ; CHECK-LABEL: bcast_unfold_smin_v2i64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB75_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2819,7 +2829,7 @@
 ; CHECK-LABEL: bcast_unfold_smax_v2i64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB81_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3027,7 +3037,7 @@
 ; CHECK-LABEL: bcast_unfold_umin_v2i64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB87_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3235,7 +3245,7 @@
 ; CHECK-LABEL: bcast_unfold_umax_v2i64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB93_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3449,13 +3459,13 @@
 ; CHECK-LABEL: bcast_unfold_pcmpgt_v2i64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1]
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB99_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1
 ; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
+; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
 ; CHECK-NEXT: addq $16, %rax
 ; CHECK-NEXT: jne .LBB99_1
@@ -3668,13 +3678,13 @@
 ; CHECK-LABEL: bcast_unfold_pcmpeq_v2i64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1]
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB105_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1
 ; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
+; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
 ; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax)
 ; CHECK-NEXT: addq $16, %rax
 ; CHECK-NEXT: jne .LBB105_1
@@ -3890,13 +3900,13 @@
 ; CHECK-LABEL: bcast_unfold_pcmp_v2i64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1]
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB111_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1
 ; CHECK-NEXT: vpcmpltq %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
+; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
 ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8)
 ; CHECK-NEXT: addq $2, %rax
 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
@@ -4115,13 +4125,13 @@
 ; CHECK-LABEL: bcast_unfold_pcmpu_v2i64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2]
+; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB117_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1
 ; CHECK-NEXT: vpcmpltuq %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
+; CHECK-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1}
 ; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8)
 ; CHECK-NEXT: addq $2, %rax
 ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF
@@ -4340,8 +4350,10 @@
 ; CHECK-LABEL: bcast_unfold_cmp_v2f64:
 ; CHECK: # %bb.0: # %bb
 ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
-; CHECK-NEXT: vmovapd {{.*#+}} xmm1 = [3.0E+0,3.0E+0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
+; CHECK-NEXT: # xmm0 = mem[0,0]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [3.0E+0,3.0E+0]
+; CHECK-NEXT: # xmm1 = mem[0,0]
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB123_1: # %bb1
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -2907,13 +2907,15 @@
 ; KNL-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2
 ; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; KNL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: zext_64xi1_to_64xi8:
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
-; SKX-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: vpbroadcastb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1} {z}
 ; SKX-NEXT: retq
 ;
 ; AVX512DQNOBW-LABEL: zext_64xi1_to_64xi8:
@@ -2923,7 +2925,9 @@
 ; AVX512DQNOBW-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2
 ; AVX512DQNOBW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQNOBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512DQNOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; AVX512DQNOBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512DQNOBW-NEXT: retq
 %mask = icmp eq <64 x i8> %x, %y
 %1 = zext <64 x i1> %mask to <64 x i8>
@@ -2938,7 +2942,9 @@
 ; KNL-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2
 ; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; KNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; KNL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: zext_32xi1_to_32xi16:
@@ -2955,7 +2961,9 @@
 ; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2
 ; AVX512DQNOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQNOBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512DQNOBW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; AVX512DQNOBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512DQNOBW-NEXT: retq
 %mask = icmp eq <32 x i16> %x, %y
 %1 = zext <32 x i1> %mask to <32 x i16>
@@ -2986,13 +2994,14 @@
 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; KNL-NEXT: vpmovdb %zmm1, %xmm1
 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: zext_32xi1_to_32xi8:
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
-; SKX-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} {z}
+; SKX-NEXT: vpbroadcastb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} {z}
 ; SKX-NEXT: retq
 ;
 ; AVX512DQNOBW-LABEL: zext_32xi1_to_32xi8:
@@ -3006,7 +3015,8 @@
 ; AVX512DQNOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
 ; AVX512DQNOBW-NEXT: vpmovdb %zmm1, %xmm1
 ; AVX512DQNOBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQNOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512DQNOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512DQNOBW-NEXT: retq
 %mask = icmp eq <32 x i16> %x, %y
 %1 = zext <32 x i1> %mask to <32 x i8>
@@ -3045,7 +3055,8 @@
 ; KNL: # %bb.0:
 ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: zext_2xi1_to_2xi64:
@@ -3059,7 +3070,7 @@
 ; AVX512DQNOBW: # %bb.0:
 ; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX512DQNOBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512DQNOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512DQNOBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; AVX512DQNOBW-NEXT: retq
 %mask = icmp eq <2 x i8> %x, %y
 %1 = zext <2 x i1> %mask to <2 x i64>
diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll
--- a/llvm/test/CodeGen/X86/avx512-trunc.ll
+++ b/llvm/test/CodeGen/X86/avx512-trunc.ll
@@ -581,7 +581,8 @@
 define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
 ; KNL-LABEL: usat_trunc_wb_256_mem:
 ; KNL: ## %bb.0:
-; KNL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; KNL-NEXT: vpminuw %ymm1, %ymm0, %ymm0
 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT: vpmovdb %zmm0, (%rdi)
 ; KNL-NEXT: vzeroupper
@@ -602,7 +603,8 @@
 define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
 ; KNL-LABEL: usat_trunc_wb_256:
 ; KNL: ## %bb.0:
-; KNL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; KNL-NEXT: vpminuw %ymm1, %ymm0, %ymm0
 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT: vpmovdb %zmm0, %xmm0
 ; KNL-NEXT: vzeroupper
@@ -622,7 +624,8 @@
 define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
 ; KNL-LABEL: usat_trunc_wb_128_mem:
 ; KNL: ## %bb.0:
-; KNL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; KNL-NEXT: vpminuw %xmm1, %xmm0, %xmm0
 ; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; KNL-NEXT: vmovq %xmm0, (%rdi)
 ; KNL-NEXT: retq
@@ -731,7 +734,8 @@
 define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
 ; KNL-LABEL: usat_trunc_wb_128:
 ; KNL: ## %bb.0:
-; KNL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; KNL-NEXT: vpminuw %xmm1, %xmm0, %xmm0
 ; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; KNL-NEXT: retq
 ;
@@ -1008,8 +1012,10 @@
 define void @negative_test1_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
 ; KNL-LABEL: negative_test1_smax_usat_trunc_wb_256_mem:
 ; KNL: ## %bb.0:
-; KNL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531]
+; KNL-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526]
+; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT: vpmovdb %zmm0, (%rdi)
 ; KNL-NEXT: vzeroupper
@@ -1017,8 +1023,10 @@
 ;
 ; SKX-LABEL: negative_test1_smax_usat_trunc_wb_256_mem:
 ; SKX: ## %bb.0:
-; SKX-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; SKX-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; SKX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531]
+; SKX-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526]
+; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
 ; SKX-NEXT: vpmovwb %ymm0, (%rdi)
 ; SKX-NEXT: vzeroupper
 ; SKX-NEXT: retq
@@ -1034,8 +1042,10 @@
 define void @negative_test2_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
 ; KNL-LABEL: negative_test2_smax_usat_trunc_wb_256_mem:
 ; KNL: ## %bb.0:
-; KNL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526]
+; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531]
+; KNL-NEXT: vpminsw %ymm1, %ymm0, %ymm0
 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; KNL-NEXT: vpmovdb %zmm0, (%rdi)
 ; KNL-NEXT: vzeroupper
@@ -1043,8 +1053,10 @@
 ;
 ; SKX-LABEL: negative_test2_smax_usat_trunc_wb_256_mem:
 ; SKX: ## %bb.0:
-; SKX-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; SKX-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; SKX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526,65526]
+; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531,65531]
+; SKX-NEXT: vpminsw %ymm1, %ymm0, %ymm0
 ; SKX-NEXT: vpmovwb %ymm0, (%rdi)
 ; SKX-NEXT: vzeroupper
 ; SKX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1192,8 +1192,10 @@
 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1]
 ; AVX512-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0]
 ; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
-; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; AVX512-NEXT: ## encoding: [0xc4,0xe2,0x79,0x59,0x0d,A,A,A,A]
+; AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc1]
 ; AVX512-NEXT: retq ## encoding: [0xc3]
 ;
 ; SKX-LABEL: test45:
@@ -1213,8 +1215,11 @@
 ; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x00]
 ; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4]
 ; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3]
-; AVX512-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x54,0x05,A,A,A,A]
+; AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [1,1]
+; AVX512-NEXT: ## encoding: [0xc5,0xfb,0x12,0x0d,A,A,A,A]
 ; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512-NEXT: ## xmm1 = mem[0,0]
+; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x54,0xc1]
 ; AVX512-NEXT: retq ## encoding: [0xc3]
 ;
 ; SKX-LABEL: test46:
@@ -1457,8 +1462,10 @@
 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff]
 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x33,0xc0]
 ; KNL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0]
-; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
-; KNL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; KNL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT: ## encoding: [0xc4,0xe2,0x79,0x78,0x0d,A,A,A,A]
+; KNL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc1]
 ; KNL-NEXT: vpextrw $0, %xmm0, (%rsi) ## encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
 ; KNL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; KNL-NEXT: retq ## encoding: [0xc3]
@@ -1487,8 +1494,10 @@
 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x45,0xc1]
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ## encoding: [0x62,0xf2,0xfe,0x48,0x28,0xc0]
 ; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0]
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
-; AVX512BW-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-NEXT: ## encoding: [0xc4,0xe2,0x79,0x78,0x0d,A,A,A,A]
+; AVX512BW-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc1]
 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
 ; AVX512BW-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; AVX512BW-NEXT: retq ## encoding: [0xc3]
@@ -1518,8 +1527,10 @@
 ; SKX-NEXT: korw %k0, %k1, %k0 ## encoding: [0xc5,0xf4,0x45,0xc0]
 ; SKX-NEXT: vpmovm2w %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x28,0xc0]
 ; SKX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc0]
-; SKX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
-; SKX-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; SKX-NEXT: vpbroadcastb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ## EVEX TO VEX Compression xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: ## encoding: [0xc4,0xe2,0x79,0x78,0x0d,A,A,A,A]
+; SKX-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; SKX-NEXT: vpand %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1]
 ; SKX-NEXT: vpextrw $0, %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
 ; SKX-NEXT: retq ## encoding: [0xc3]
 entry:
diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll
--- a/llvm/test/CodeGen/X86/avx512fp16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll
@@ -370,7 +370,7 @@
 define <8 x half> @fcopysignv8f16(<8 x half> %x, <8 x half> %y) {
 ; CHECK-LABEL: fcopysignv8f16:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; CHECK-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %a = call <8 x half> @llvm.copysign.v8f16(<8 x half> %x, <8 x half> %y)
 ret <8 x half> %a
diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
--- a/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
@@ -739,8 +739,10 @@
 define <2 x half> @test_u1tofp2(<2 x i1> %arg0) {
 ; CHECK-LABEL: test_u1tofp2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovqw %xmm0, %xmm0
-; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0
 ; CHECK-NEXT: retq
 %res = uitofp <2 x i1> %arg0 to <2 x half>
@@ -761,7 +763,7 @@
 define <2 x half> @test_u33tofp2(<2 x i33> %arg0) {
 ; CHECK-LABEL: test_u33tofp2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; CHECK-NEXT: vcvtuqq2ph %xmm0, %xmm0
 ; CHECK-NEXT: retq
 %res = uitofp <2 x i33> %arg0 to <2 x half>
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll
--- a/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll
@@ -357,7 +357,7 @@
 define <8 x half> @fadd_bitcast_fneg_vec_width(<8 x half> %x, <8 x half> %y) {
 ; CHECK-LABEL: fadd_bitcast_fneg_vec_width:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
 ; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: retq
 %bc1 = bitcast <8 x half> %y to <2 x i64>
@@ -370,7 +370,7 @@
 define <8 x half> @fsub_bitcast_fneg_vec_width(<8 x half> %x, <8 x half> %y) {
 ; CHECK-LABEL: fsub_bitcast_fneg_vec_width:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
 ; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: retq
 %bc1 = bitcast <8 x half> %y to <2 x i64>
diff --git a/llvm/test/CodeGen/X86/avx512vl-logic.ll b/llvm/test/CodeGen/X86/avx512vl-logic.ll
--- a/llvm/test/CodeGen/X86/avx512vl-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-logic.ll
@@ -168,7 +168,7 @@
 define <2 x i64> @vpandq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: vpandq128:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: retq
 entry:
@@ -181,7 +181,7 @@
 define <2 x i64> @vpandnq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: vpandnq128:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; CHECK-NEXT: vpandn %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 entry:
@@ -195,7 +195,7 @@
 define <2 x i64> @vporq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: vporq128:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: retq
 entry:
@@ -208,7 +208,7 @@
 define <2 x i64> @vpxorq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: vpxorq128:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: retq
 entry:
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -215,7 +215,8 @@
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: ext_i16_16i8:
@@ -229,7 +230,7 @@
 ; AVX512VLBW-LABEL: ext_i16_16i8:
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpbroadcastb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
 ; AVX512VLBW-NEXT: retq
 %1 = bitcast i16 %a0 to <16 x i1>
 %2 = zext <16 x i1> %1 to <16 x i8>
@@ -475,7 +476,8 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: ext_i32_32i8:
@@ -494,7 +496,7 @@
 ; AVX512VLBW-LABEL: ext_i32_32i8:
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: kmovd %edi, %k1
-; AVX512VLBW-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpbroadcastb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} {z}
 ; AVX512VLBW-NEXT: retq
 %1 = bitcast i32 %a0 to <32 x i1>
 %2 = zext <32 x i1> %1 to <32 x i8>
@@ -843,7 +845,7 @@
 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
@@ -880,7 +882,7 @@
 ; AVX512VLBW-LABEL: ext_i64_64i8:
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: kmovq %rdi, %k1
-; AVX512VLBW-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: vpbroadcastb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1} {z}
 ; AVX512VLBW-NEXT: retq
 %1 = bitcast i64 %a0 to <64 x i1>
 %2 = zext <64 x i1> %1 to <64 x i8>
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -176,7 +176,8 @@
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: bitcast_i16_16i1:
@@ -226,7 +227,8 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: bitcast_i32_32i1:
diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
--- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -515,7 +515,8 @@
 ; AVX2-LABEL: v16i8_widened_with_ones:
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
 ; AVX2-NEXT: vpmovmskb %ymm0, %ecx
 ; AVX2-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
--- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -177,10 +177,16 @@
 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: _clearupper8xi16a:
-; AVX: # %bb.0:
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: _clearupper8xi16a:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper8xi16a:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
 %x0 = extractelement <8 x i16> %0, i32 0
 %x1 = extractelement <8 x i16> %0, i32 1
 %x2 = extractelement <8 x i16> %0, i32 2
@@ -224,10 +230,16 @@
 ; SSE-NEXT: andps %xmm2, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: _clearupper16xi16a:
-; AVX: # %bb.0:
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: _clearupper16xi16a:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper16xi16a:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
 %x0 = extractelement <16 x i16> %0, i32 0
 %x1 = extractelement <16 x i16> %0, i32 1
 %x2 = extractelement <16 x i16> %0, i32 2
@@ -301,10 +313,16 @@
 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: _clearupper16xi8a:
-; AVX: # %bb.0:
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: _clearupper16xi8a:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper16xi8a:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
 %x0 = extractelement <16 x i8> %0, i32 0
 %x1 = extractelement <16 x i8> %0, i32 1
 %x2 = extractelement <16 x i8> %0, i32 2
@@ -380,10 +398,16 @@
 ; SSE-NEXT: andps %xmm2, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: _clearupper32xi8a:
-; AVX: # %bb.0:
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: _clearupper32xi8a:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper32xi8a:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
 %x0 = extractelement <32 x i8> %0, i32 0
 %x1 = extractelement <32 x i8> %0, i32 1
 %x2 = extractelement <32 x i8> %0, i32 2
@@ -1341,10 +1365,16 @@
 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: _clearupper16xi8c:
-; AVX: # %bb.0:
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: _clearupper16xi8c:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper16xi8c:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
 %r = and <16 x i8> , %0
 ret <16 x i8> %r
 }
@@ -1357,10 +1387,16 @@
 ; SSE-NEXT: andps %xmm2, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: _clearupper32xi8c:
-; AVX: # %bb.0:
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: _clearupper32xi8c:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper32xi8c:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
 %r = and <32 x i8> , %0
 ret <32 x i8> %r
 }
diff --git a/llvm/test/CodeGen/X86/combine-abs.ll b/llvm/test/CodeGen/X86/combine-abs.ll
--- a/llvm/test/CodeGen/X86/combine-abs.ll
+++ b/llvm/test/CodeGen/X86/combine-abs.ll
@@ -168,7 +168,8 @@
 ;
 ; AVX-LABEL: combine_v16i8_abs_constant:
 ; AVX: # %bb.0:
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 %1 = insertelement <16 x i8> undef, i8 15, i32 0
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/combine-add-ssat.ll b/llvm/test/CodeGen/X86/combine-add-ssat.ll
--- a/llvm/test/CodeGen/X86/combine-add-ssat.ll
+++ b/llvm/test/CodeGen/X86/combine-add-ssat.ll
@@ -93,11 +93,6 @@
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
-;
-; AVX-LABEL: combine_constant_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
 %res = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> , <8 x i16> %a0)
 ret <8 x i16> %res
 }
diff --git a/llvm/test/CodeGen/X86/combine-add-usat.ll b/llvm/test/CodeGen/X86/combine-add-usat.ll
--- a/llvm/test/CodeGen/X86/combine-add-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-add-usat.ll
@@ -90,11 +90,6 @@
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
-;
-; AVX-LABEL: combine_constant_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
 %1 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> , <8 x i16> %a0)
 ret <8 x i16> %1
 }
diff --git a/llvm/test/CodeGen/X86/combine-movmsk.ll b/llvm/test/CodeGen/X86/combine-movmsk.ll
--- a/llvm/test/CodeGen/X86/combine-movmsk.ll
+++ b/llvm/test/CodeGen/X86/combine-movmsk.ll
@@ -237,12 +237,6 @@
 ; SSE42-NEXT: movmskpd %xmm0, %eax
 ; SSE42-NEXT: xorl $3, %eax
 ; SSE42-NEXT: retq
-;
-; AVX-LABEL: movmskpd_pow2_mask:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovmskpd %xmm0, %eax
-; AVX-NEXT: xorl $3, %eax
-; AVX-NEXT: retq
 %1 = and <2 x i64> %a0,
 %2 = icmp eq <2 x i64> %1, zeroinitializer
 %3 = sext <2 x i1> %2 to <2 x i64>
@@ -273,13 +267,6 @@
 ; SSE-NEXT: pmovmskb %xmm0, %eax
 ; SSE-NEXT: xorl $65535, %eax # imm = 0xFFFF
 ; SSE-NEXT: retq
-;
-; AVX-LABEL: pmovmskb_pow2_mask:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX-NEXT: vpmovmskb %xmm0, %eax
-; AVX-NEXT: xorl $65535, %eax # imm = 0xFFFF
-; AVX-NEXT: retq
 %1 = and <16 x i8> %a0,
 %2 = icmp eq <16 x i8> %1, zeroinitializer
 %3 = sext <16 x i1> %2 to <16 x i8>
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -307,11 +307,13 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vzeroupper
@@ -345,7 +347,8 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,1]
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
 ; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
@@ -451,7 +454,8 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll
--- a/llvm/test/CodeGen/X86/combine-pavg.ll
+++ b/llvm/test/CodeGen/X86/combine-pavg.ll
@@ -33,18 +33,6 @@
 ; SSE-NEXT: pavgw %xmm2, %xmm3
 ; SSE-NEXT: packuswb %xmm3, %xmm0
 ; SSE-NEXT: retq
-;
-; AVX-LABEL: combine_pavgw_knownbits:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31]
-; AVX-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm4, %xmm2, %xmm1
-; AVX-NEXT: vpand %xmm4, %xmm3, %xmm2
-; AVX-NEXT: vpavgw %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
 %m0 = and <8 x i16> %a0,
 %m1 = and <8 x i16> %a1,
 %m2 = and <8 x i16> %a2,
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -355,7 +355,8 @@
 ;
 ; AVX-LABEL: combine_vec_shl_zext_lshr0:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65520,65520,65520,65520,65520,65520,65520,65520]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT: retq
 %1 = lshr <8 x i16> %x,
diff --git a/llvm/test/CodeGen/X86/combine-smax.ll b/llvm/test/CodeGen/X86/combine-smax.ll
--- a/llvm/test/CodeGen/X86/combine-smax.ll
+++ b/llvm/test/CodeGen/X86/combine-smax.ll
@@ -2,10 +2,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2

 define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) {
 ; SSE2-LABEL: test_v16i8_nosignbit:
@@ -32,13 +32,21 @@
 ; SSE42-NEXT: pmaxsb %xmm2, %xmm0
 ; SSE42-NEXT: retq
 ;
-; AVX-LABEL: test_v16i8_nosignbit:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v16i8_nosignbit:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v16i8_nosignbit:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
 %1 = and <16 x i8> %a,
 %2 = and <16 x i8> %b,
 %3 = icmp sgt <16 x i8> %1, %2
diff --git a/llvm/test/CodeGen/X86/combine-smin.ll b/llvm/test/CodeGen/X86/combine-smin.ll
--- a/llvm/test/CodeGen/X86/combine-smin.ll
+++ b/llvm/test/CodeGen/X86/combine-smin.ll
@@ -2,10 +2,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2

 define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) {
 ; SSE2-LABEL: test_v16i8_nosignbit:
@@ -32,13 +32,21 @@
 ; SSE42-NEXT: pminsb %xmm2, %xmm0
 ; SSE42-NEXT: retq
 ;
-; AVX-LABEL: test_v16i8_nosignbit:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-;
AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i8_nosignbit: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i8_nosignbit: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = and <16 x i8> %a, %2 = and <16 x i8> %b, %3 = icmp slt <16 x i8> %1, %2 diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -474,14 +474,24 @@ ; SSE-NEXT: psrlw $4, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_udiv_uniform: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_udiv_uniform: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_udiv_uniform: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [25645,25645,25645,25645,25645,25645,25645,25645] +; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: combine_vec_udiv_uniform: ; XOP: # %bb.0: @@ -658,17 +668,30 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_vec_udiv_nonuniform4: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_udiv_nonuniform4: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_udiv_nonuniform4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: combine_vec_udiv_nonuniform4: ; XOP: # %bb.0: diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll --- a/llvm/test/CodeGen/X86/concat-cast.ll +++ b/llvm/test/CodeGen/X86/concat-cast.ll @@ -373,7 +373,7 @@ ; AVX2-LABEL: mismatch_tofp_v4i32_v4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vsubpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -65,7 +65,8 @@ ; AVXVNNI-LABEL: mul_4xi4_cz: ; AVXVNNI: # %bb.0: # %entry ; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXVNNI-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVXVNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 @@ -76,7 +77,8 @@ ; AVX512VNNI-LABEL: mul_4xi4_cz: ; AVX512VNNI: # %bb.0: # %entry ; AVX512VNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512VNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -89,7 +91,8 @@ ; AVX512VLVNNI-LABEL: mul_4xi4_cz: ; AVX512VLVNNI: # %bb.0: # %entry ; AVX512VLVNNI-NEXT: vpmovdb %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLVNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll --- a/llvm/test/CodeGen/X86/dpbusd_i4.ll +++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll @@ -30,7 +30,8 @@ define i32 @mul_i4i8(<16 x i4> %a, <16 x i8> %b, i32 %c) { ; CHECK-LABEL: mul_i4i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] @@ -52,13 +53,15 
@@ define i32 @mul_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-LABEL: mul_i4i4: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpsllw $4, %xmm1, %xmm1 -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpsrlw $4, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; CHECK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; CHECK-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] @@ -109,7 +112,7 @@ define i32 @mul_zext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-LABEL: mul_zext_i4i4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll b/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll --- a/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll +++ b/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll @@ -74,7 +74,9 @@ define <2 x double> @ExeDepsFix_broadcastsd(<2 x double> %arg, <2 x double> %arg2) { ; CHECK-LABEL: ExeDepsFix_broadcastsd: ; CHECK: ## %bb.0: -; CHECK-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [2147483647,2147483647] +; CHECK-NEXT: ## xmm2 = mem[0,0] +; CHECK-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %bitcast = bitcast <2 x double> %arg to <2 x i64> diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -317,23 +317,16 @@ ; This used to crash by creating a setcc with an i64 condition on a 32-bit target. 
define <3 x double> @extvselectsetcc_crash(<2 x double> %x) { -; X64-LABEL: extvselectsetcc_crash: -; X64: # %bb.0: -; X64-NEXT: vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X64-NEXT: vandpd %xmm2, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; X64-NEXT: retq -; -; X86-LABEL: extvselectsetcc_crash: -; X86: # %bb.0: -; X86-NEXT: vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 -; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vandpd %xmm2, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X86-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; X86-NEXT: retl +; CHECK-LABEL: extvselectsetcc_crash: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [5.0E+0,5.0E+0] +; CHECK-NEXT: # xmm1 = mem[0,0] +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 +; CHECK-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: vandpd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; CHECK-NEXT: ret{{[l|q]}} %cmp = fcmp oeq <2 x double> %x, %s = select <2 x i1> %cmp, <2 x double> , <2 x double> %r = shufflevector <2 x double> %s, <2 x double> %x, <3 x i32> @@ -546,7 +539,9 @@ define double @fabs_v4f64(<4 x double> %x) nounwind { ; X64-LABEL: fabs_v4f64: ; X64: # %bb.0: -; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vmovddup {{.*#+}} xmm1 = [NaN,NaN] +; X64-NEXT: # xmm1 = mem[0,0] +; X64-NEXT: vandps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -556,7 +551,9 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovddup {{.*#+}} xmm1 = [NaN,NaN] +; X86-NEXT: # xmm1 = mem[0,0] +; X86-NEXT: vandps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlps %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -818,8 +815,12 @@ define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind { ; X64-LABEL: copysign_v4f64: ; X64: # %bb.0: -; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; X64-NEXT: # xmm2 = mem[0,0] +; X64-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X64-NEXT: vmovddup {{.*#+}} xmm2 = [NaN,NaN] +; X64-NEXT: # xmm2 = mem[0,0] +; X64-NEXT: vandps %xmm2, %xmm0, %xmm0 ; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -830,8 +831,12 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 -; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; X86-NEXT: # xmm2 = mem[0,0] +; X86-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X86-NEXT: vmovddup {{.*#+}} xmm2 = [NaN,NaN] +; X86-NEXT: # xmm2 = mem[0,0] +; X86-NEXT: vandps %xmm2, %xmm0, %xmm0 ; X86-NEXT: vorps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlps %xmm0, (%esp) ; X86-NEXT: fldl (%esp) @@ -1096,7 +1101,9 @@ define double @round_v4f64(<4 x double> %x) nounwind { ; X64-LABEL: round_v4f64: ; X64: # %bb.0: -; X64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; X64-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; X64-NEXT: # xmm1 = mem[0,0] +; X64-NEXT: vandpd %xmm1, %xmm0, %xmm1 ; X64-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1] ; X64-NEXT: # xmm2 
= mem[0,0] ; X64-NEXT: vorpd %xmm2, %xmm1, %xmm1 @@ -1111,7 +1118,9 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; X86-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; X86-NEXT: # xmm1 = mem[0,0] +; X86-NEXT: vandpd %xmm1, %xmm0, %xmm1 ; X86-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1] ; X86-NEXT: # xmm2 = mem[0,0] ; X86-NEXT: vorpd %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine.ll b/llvm/test/CodeGen/X86/fma-fneg-combine.ll --- a/llvm/test/CodeGen/X86/fma-fneg-combine.ll +++ b/llvm/test/CodeGen/X86/fma-fneg-combine.ll @@ -226,11 +226,19 @@ } define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) { -; CHECK-LABEL: test10: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 -; CHECK-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: retq +; SKX-LABEL: test10: +; SKX: # %bb.0: # %entry +; SKX-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; SKX-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: test10: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; KNL-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; KNL-NEXT: # xmm1 = mem[0,0] +; KNL-NEXT: vxorpd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: retq entry: %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 -1, i32 4) #2 %sub.i = fsub <2 x double> , %0 @@ -305,7 +313,7 @@ define <2 x double> @test13(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { ; SKX-LABEL: test13: ; SKX: # %bb.0: # %entry -; SKX-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; SKX-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm3 ; SKX-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm0 * xmm1) + xmm2 ; SKX-NEXT: kmovd %edi, %k1 ; SKX-NEXT: vmovsd %xmm1, %xmm3, %xmm3 {%k1} @@ -314,7 +322,9 @@ ; ; KNL-LABEL: test13: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; KNL-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; KNL-NEXT: # xmm3 = mem[0,0] +; KNL-NEXT: vxorpd %xmm3, %xmm0, %xmm3 ; KNL-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm0 * xmm1) + xmm2 ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vmovsd %xmm1, %xmm3, %xmm3 {%k1} diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1481,7 +1481,8 @@ ; ; AVX512-INFS-LABEL: test_v2f64_interp: ; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; AVX512-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; AVX512-INFS-NEXT: # xmm3 = mem[0,0] ; AVX512-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3 ; AVX512-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1 ; AVX512-INFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll @@ -226,17 +226,25 @@ ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; -; FMA-LABEL: f8: -; FMA: # %bb.0: # %entry -; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 -; FMA-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-NEXT: retq +; FMA-AVX1-LABEL: f8: +; FMA-AVX1: # %bb.0: # %entry 
+; FMA-AVX1-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; FMA-AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-AVX1-NEXT: retq ; ; FMA4-LABEL: f8: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 ; FMA4-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: retq +; +; FMA-AVX512-LABEL: f8: +; FMA-AVX512: # %bb.0: # %entry +; FMA-AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; FMA-AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; FMA-AVX512-NEXT: # xmm1 = mem[0,0] +; FMA-AVX512-NEXT: vxorpd %xmm1, %xmm0, %xmm0 +; FMA-AVX512-NEXT: retq entry: %3 = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.dynamic", @@ -301,17 +309,25 @@ ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; -; FMA-LABEL: f10: -; FMA: # %bb.0: # %entry -; FMA-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 -; FMA-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-NEXT: retq +; FMA-AVX1-LABEL: f10: +; FMA-AVX1: # %bb.0: # %entry +; FMA-AVX1-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; FMA-AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-AVX1-NEXT: retq ; ; FMA4-LABEL: f10: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 ; FMA4-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: retq +; +; FMA-AVX512-LABEL: f10: +; FMA-AVX512: # %bb.0: # %entry +; FMA-AVX512-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; FMA-AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; FMA-AVX512-NEXT: # xmm1 = mem[0,0] +; FMA-AVX512-NEXT: vxorpd %xmm1, %xmm0, %xmm0 +; FMA-AVX512-NEXT: retq entry: %3 = fneg double %0 %4 = fneg double %2 @@ -930,17 +946,25 @@ ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; -; FMA-LABEL: f26: -; FMA: # %bb.0: # %entry -; FMA-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 -; FMA-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-NEXT: retq +; FMA-AVX1-LABEL: f26: +; FMA-AVX1: # %bb.0: # %entry +; FMA-AVX1-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; FMA-AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-AVX1-NEXT: retq ; ; FMA4-LABEL: f26: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 ; FMA4-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: retq +; +; FMA-AVX512-LABEL: f26: +; FMA-AVX512: # %bb.0: # %entry +; FMA-AVX512-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; FMA-AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; FMA-AVX512-NEXT: # xmm1 = mem[0,0] +; FMA-AVX512-NEXT: vxorpd %xmm1, %xmm0, %xmm0 +; FMA-AVX512-NEXT: retq entry: %3 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2, metadata !"round.dynamic", @@ -1053,17 +1077,25 @@ ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq ; -; FMA-LABEL: f28: -; FMA: # %bb.0: # %entry -; FMA-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 -; FMA-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-NEXT: retq +; FMA-AVX1-LABEL: f28: +; FMA-AVX1: # %bb.0: # %entry +; FMA-AVX1-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; FMA-AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-AVX1-NEXT: retq ; ; FMA4-LABEL: f28: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 ; FMA4-NEXT: vxorpd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: retq +; +; FMA-AVX512-LABEL: f28: +; FMA-AVX512: # %bb.0: # %entry +; FMA-AVX512-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 +; FMA-AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; FMA-AVX512-NEXT: # xmm1 = mem[0,0] +; FMA-AVX512-NEXT: vxorpd %xmm1, %xmm0, %xmm0 +; FMA-AVX512-NEXT: retq entry: %3 = fneg <2 x double> %0 %4 = fneg <2 x double> %2 diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -138,7 +138,7 @@ ; AVX512-LABEL: round_f64: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1] -; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -239,8 +239,8 @@ ; ; AVX512-LABEL: round_v2f64: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1] -; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1] +; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vroundpd $11, %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -1315,22 +1315,6 @@ ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl $4 -; -; X64-AVX-LABEL: TestTruncCopysign: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: cmpl $50001, %edi # imm = 0xC351 -; X64-AVX-NEXT: jl .LBB26_2 -; X64-AVX-NEXT: # %bb.1: # %if.then -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: callq __trunctfdf2@PLT -; X64-AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [+Inf,+Inf] -; X64-AVX-NEXT: # xmm1 = mem[0,0] -; X64-AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: callq __extenddftf2@PLT -; X64-AVX-NEXT: addq $8, %rsp -; X64-AVX-NEXT: .LBB26_2: # %cleanup -; X64-AVX-NEXT: retq entry: %cmp = icmp sgt i32 %n, 50000 br i1 %cmp, label %if.then, label %cleanup diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+gfni | FileCheck %s --check-prefixes=GFNISSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX1OR2,GFNIAVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX1OR2,GFNIAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+gfni | FileCheck %s --check-prefixes=GFNIAVX512 ; @@ -18,20 +18,31 @@ ; GFNISSE-NEXT: por %xmm1, %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX1OR2-LABEL: 
splatconstant_fshl_v16i8: -; GFNIAVX1OR2: # %bb.0: -; GFNIAVX1OR2-NEXT: vpsrlw $5, %xmm1, %xmm1 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; GFNIAVX1OR2-NEXT: vpsllw $3, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: retq +; GFNIAVX1-LABEL: splatconstant_fshl_v16i8: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpsrlw $5, %xmm1, %xmm1 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; GFNIAVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: splatconstant_fshl_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpsrlw $5, %xmm1, %xmm1 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; GFNIAVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; GFNIAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_fshl_v16i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm2 ; GFNIAVX512-NEXT: vpsrlw $5, %xmm1, %xmm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) ret <16 x i8> %res @@ -47,19 +58,29 @@ ; GFNISSE-NEXT: por %xmm1, %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX1OR2-LABEL: splatconstant_fshr_v16i8: -; GFNIAVX1OR2: # %bb.0: -; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm1, %xmm1 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: retq +; GFNIAVX1-LABEL: splatconstant_fshr_v16i8: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: splatconstant_fshr_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpsrlw $7, %xmm1, %xmm1 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; GFNIAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_fshr_v16i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $7, %xmm1, %xmm1 -; GFNIAVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; GFNIAVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX512-NEXT: vpternlogq $234, %xmm2, %xmm1, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) ret <16 x i8> %res @@ -109,9 +130,11 @@ ; GFNIAVX2-LABEL: splatconstant_fshl_v32i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpsllw $4, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; @@ -165,9 +188,11 @@ ; GFNIAVX2-LABEL: splatconstant_fshr_v32i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $6, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpsllw $2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; @@ -238,7 +263,7 @@ ; GFNIAVX2-LABEL: splatconstant_fshl_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $7, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -251,8 +276,9 @@ ; GFNIAVX512-LABEL: splatconstant_fshl_v64i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $7, %zmm1, %zmm1 -; GFNIAVX512-NEXT: vpaddb %zmm0, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; GFNIAVX512-NEXT: vpaddb %zmm0, %zmm0, %zmm2 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX512-NEXT: vpternlogq $234, %zmm2, %zmm1, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> ) ret <64 x i8> %res @@ -323,7 +349,7 @@ ; GFNIAVX2-LABEL: splatconstant_fshr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $2, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] ; GFNIAVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+gfni | FileCheck %s --check-prefixes=GFNISSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX1OR2,GFNIAVX1 -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX1OR2,GFNIAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+gfni | FileCheck %s --check-prefixes=GFNIAVX512 ; @@ -19,20 +19,31 @@ ; GFNISSE-NEXT: por %xmm1, %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX1OR2-LABEL: splatconstant_rotl_v16i8: -; GFNIAVX1OR2: # %bb.0: -; GFNIAVX1OR2-NEXT: vpsrlw $5, %xmm0, %xmm1 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; GFNIAVX1OR2-NEXT: vpsllw $3, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: retq +; GFNIAVX1-LABEL: splatconstant_rotl_v16i8: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpsrlw $5, %xmm0, %xmm1 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; GFNIAVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: splatconstant_rotl_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpsrlw $5, %xmm0, %xmm1 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; GFNIAVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; GFNIAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_rotl_v16i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm1 ; GFNIAVX512-NEXT: vpsrlw $5, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) ret <16 x i8> %res @@ -49,19 +60,29 @@ ; GFNISSE-NEXT: por %xmm1, %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX1OR2-LABEL: splatconstant_rotr_v16i8: -; GFNIAVX1OR2: # %bb.0: -; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm0, %xmm1 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: retq +; GFNIAVX1-LABEL: splatconstant_rotr_v16i8: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpsrlw $7, %xmm0, %xmm1 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: splatconstant_rotr_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpsrlw $7, %xmm0, %xmm1 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; GFNIAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_rotr_v16i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $7, %xmm0, %xmm1 -; GFNIAVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; GFNIAVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; GFNIAVX512-NEXT: vpbroadcastb 
{{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX512-NEXT: vpternlogq $234, %xmm2, %xmm1, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) ret <16 x i8> %res @@ -111,9 +132,11 @@ ; GFNIAVX2-LABEL: splatconstant_rotl_v32i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $4, %ymm0, %ymm1 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpsllw $4, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; @@ -167,9 +190,11 @@ ; GFNIAVX2-LABEL: splatconstant_rotr_v32i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $6, %ymm0, %ymm1 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpsllw $2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; @@ -242,7 +267,7 @@ ; GFNIAVX2-LABEL: splatconstant_rotl_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -255,8 +280,9 @@ ; GFNIAVX512-LABEL: splatconstant_rotl_v64i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $7, %zmm0, %zmm1 -; GFNIAVX512-NEXT: vpaddb %zmm0, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; GFNIAVX512-NEXT: vpaddb %zmm0, %zmm0, %zmm2 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX512-NEXT: vpternlogq $234, %zmm2, %zmm1, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> ) ret <64 x i8> %res @@ -328,7 +354,7 @@ ; GFNIAVX2-LABEL: splatconstant_rotr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] ; GFNIAVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 diff --git 
a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+gfni | FileCheck %s --check-prefixes=GFNISSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX1OR2,GFNIAVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX1OR2,GFNIAVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+gfni | FileCheck %s --check-prefixes=GFNIAVX512 ; ; 128 Bit Vector Shifts @@ -15,11 +15,25 @@ ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX-LABEL: splatconstant_shl_v16i8: -; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX-NEXT: retq +; GFNIAVX1-LABEL: splatconstant_shl_v16i8: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: splatconstant_shl_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; GFNIAVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512-LABEL: splatconstant_shl_v16i8: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm0 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; GFNIAVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; GFNIAVX512-NEXT: retq %shift = shl <16 x i8> %a, ret <16 x i8> %shift } @@ -31,11 +45,25 @@ ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX-LABEL: splatconstant_lshr_v16i8: -; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: vpsrlw $7, %xmm0, %xmm0 -; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX-NEXT: retq +; GFNIAVX1-LABEL: splatconstant_lshr_v16i8: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: splatconstant_lshr_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512-LABEL: splatconstant_lshr_v16i8: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vpsrlw $7, %xmm0, %xmm0 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; GFNIAVX512-NEXT: retq %shift = lshr <16 x i8> %a, ret <16 x i8> %shift } @@ -50,21 +78,32 @@ ; GFNISSE-NEXT: psubb %xmm1, %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX1OR2-LABEL: splatconstant_ashr_v16i8: -; GFNIAVX1OR2: # %bb.0: -; GFNIAVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; 
GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; GFNIAVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: retq +; GFNIAVX1-LABEL: splatconstant_ashr_v16i8: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; GFNIAVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: splatconstant_ashr_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; GFNIAVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; GFNIAVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_ashr_v16i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; GFNIAVX512-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 -; GFNIAVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; GFNIAVX512-NEXT: vpternlogq $108, %xmm0, %xmm2, %xmm1 +; GFNIAVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm0 ; GFNIAVX512-NEXT: retq %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -98,13 +137,15 @@ ; GFNIAVX2-LABEL: splatconstant_shl_v32i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_shl_v32i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $6, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -134,13 +175,15 @@ ; GFNIAVX2-LABEL: splatconstant_lshr_v32i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; GFNIAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_lshr_v32i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $1, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; GFNIAVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq 
%shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -180,8 +223,9 @@ ; GFNIAVX2-LABEL: splatconstant_ashr_v32i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; GFNIAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; GFNIAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq @@ -189,9 +233,10 @@ ; GFNIAVX512-LABEL: splatconstant_ashr_v32i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $2, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; GFNIAVX512-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; GFNIAVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; GFNIAVX512-NEXT: vpternlogq $108, %ymm0, %ymm2, %ymm1 +; GFNIAVX512-NEXT: vpsubb %ymm2, %ymm1, %ymm0 ; GFNIAVX512-NEXT: retq %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -235,7 +280,7 @@ ; GFNIAVX2-LABEL: splatconstant_shl_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsllw $5, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] ; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -244,7 +289,8 @@ ; GFNIAVX512-LABEL: splatconstant_shl_v64i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $5, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; GFNIAVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -284,7 +330,7 @@ ; GFNIAVX2-LABEL: splatconstant_lshr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsrlw $7, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -293,7 +339,8 @@ ; GFNIAVX512-LABEL: splatconstant_lshr_v64i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $7, %zmm0, %zmm0 -; 
GFNIAVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -351,9 +398,9 @@ ; GFNIAVX2-LABEL: splatconstant_ashr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; GFNIAVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 @@ -365,9 +412,10 @@ ; GFNIAVX512-LABEL: splatconstant_ashr_v64i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $1, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; GFNIAVX512-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 -; GFNIAVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0 +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; GFNIAVX512-NEXT: vpternlogq $108, %zmm0, %zmm2, %zmm1 +; GFNIAVX512-NEXT: vpsubb %zmm2, %zmm1, %zmm0 ; GFNIAVX512-NEXT: retq %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -218,14 +218,24 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: 
test_reduce_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v8i16: ; X64-SSE2: ## %bb.0: @@ -249,14 +259,34 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> %2 = icmp sgt <8 x i16> %a0, %1 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 @@ -315,16 +345,28 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorb $127, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorb $127, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorb 
$127, %al +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i8: ; X64-SSE2: ## %bb.0: @@ -370,16 +412,40 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorb $127, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorb $127, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorb $127, %al +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorb $127, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> %2 = icmp sgt <16 x i8> %a0, %1 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 @@ -747,7 +813,8 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF @@ -795,7 +862,8 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF @@ -807,7 +875,8 @@ ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; 
X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF @@ -899,7 +968,8 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -977,7 +1047,8 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -991,7 +1062,8 @@ ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1508,7 +1580,8 @@ ; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF @@ -1564,7 +1637,8 @@ ; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF @@ -1578,7 +1652,8 @@ ; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF @@ -1689,7 +1764,8 @@ ; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX2-NEXT: 
vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -1783,7 +1859,8 @@ ; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -1799,7 +1876,8 @@ ; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1857,15 +1935,26 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1889,15 +1978,37 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; 
X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> %2 = icmp sgt <16 x i16> %a0, %1 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 @@ -1934,15 +2045,26 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1966,15 +2088,37 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; 
X64-AVX512-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> %2 = icmp sgt <32 x i16> %a0, %1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 @@ -2033,17 +2177,30 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorb $127, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorb $127, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorb $127, %al +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2089,17 +2246,43 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorb $127, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorb $127, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; 
X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorb $127, %al +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorb $127, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> %2 = icmp sgt <32 x i8> %a0, %1 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 @@ -2161,17 +2344,30 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorb $127, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorb $127, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorb $127, %al +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2217,17 +2413,43 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorb $127, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorb $127, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastb 
{{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorb $127, %al +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorb $127, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> %2 = icmp sgt <64 x i8> %a0, %1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -220,14 +220,24 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v8i16: ; X64-SSE2: ## %bb.0: @@ -251,14 +261,34 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] 
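; The XOR pair around vphminposuw in these signed reductions is the usual
; sign-bit bias, an identity that predates this patch rather than being part
; of it:
;   smin(a, b) == umin(a ^ 0x8000, b ^ 0x8000) ^ 0x8000   ; for i16 lanes
; vphminposuw only computes an unsigned i16 minimum, so the 32768 splat is
; XORed in first and the scalar result is un-biased with xorl $32768 after
; (the smax tests use 0x7FFF, which flips the sign bit and reverses the
; order in one step). All this patch changes is how that splat constant is
; materialized: with AVX2 it is rematerialized via vpbroadcastw from a
; 2-byte constant-pool entry instead of a full 16-byte vmovdqa row.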
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> %2 = icmp slt <8 x i16> %a0, %1 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 @@ -317,16 +347,28 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: addb $-128, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: addb $-128, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: addb $-128, %al +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i8: ; X64-SSE2: ## %bb.0: @@ -372,16 +414,40 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: addb $-128, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: addb $-128, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: 
vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: addb $-128, %al +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: addb $-128, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> %2 = icmp slt <16 x i8> %a0, %1 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 @@ -751,7 +817,8 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -799,7 +866,8 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -811,7 +879,8 @@ ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -903,7 +972,8 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -981,7 +1051,8 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -995,7 +1066,8 @@ ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1512,7 +1584,8 @@ ; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1568,7 +1641,8 @@ ; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1582,7 +1656,8 @@ ; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 @@ -1693,7 +1768,8 @@ ; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -1787,7 +1863,8 @@ ; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -1803,7 +1880,8 @@ ; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1861,15 +1939,26 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: 
vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1893,15 +1982,37 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> %2 = icmp slt <16 x i16> %a0, %1 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 @@ -1938,15 +2049,26 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: 
test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1970,15 +2092,37 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> %2 = icmp slt <32 x i16> %a0, %1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 @@ -2037,17 +2181,30 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: addb $-128, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, 
%xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: addb $-128, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: addb $-128, %al +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2093,17 +2250,43 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: addb $-128, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: addb $-128, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: addb $-128, %al +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: addb $-128, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> %2 = icmp slt <32 x i8> %a0, %1 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 @@ -2165,17 +2348,30 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: addb $-128, %al -; X86-AVX-NEXT: ## kill: def $al killed $al 
killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: addb $-128, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: addb $-128, %al +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2221,17 +2417,43 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: addb $-128, %al -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: addb $-128, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: addb $-128, %al +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: addb $-128, %al +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> %2 = icmp slt <64 x i8> %a0, %1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ 
b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -122,7 +122,7 @@ ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -124,7 +124,7 @@ ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -366,16 +366,16 @@ ; ; X64-AVX512F-LABEL: clamp_sitofp_2i64_2f64: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; X64-AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; X64-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512DQ-LABEL: clamp_sitofp_2i64_2f64: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX512DQ-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: vcvtqq2pd %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: retq %clo = icmp slt <2 x i64> %a, diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -772,7 +772,8 @@ ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpblendvb %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; X64-AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; X64-AVX2-NEXT: vmovdqa %ymm0, (%rdi) ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll --- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll @@ -425,7 +425,7 @@ ; AVX2-LABEL: reassociate_umax_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, 
%xmm2, %xmm4 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5 ; AVX2-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4 @@ -723,7 +723,7 @@ ; AVX2-LABEL: reassociate_umin_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5 ; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -2474,10 +2474,10 @@ ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2147483647,2147483647] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2499,8 +2499,8 @@ ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq ; @@ -2613,34 +2613,63 @@ ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB7_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB7_3 -; AVX-NEXT: .LBB7_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB7_1: # %cond.store -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB7_4 -; AVX-NEXT: .LBB7_3: # %cond.store1 -; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB7_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB7_3 +; AVX1-NEXT: .LBB7_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB7_1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB7_4 +; AVX1-NEXT: .LBB7_3: # %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [32767,32767] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB7_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB7_3 +; AVX2-NEXT: .LBB7_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB7_1: # %cond.store +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB7_4 +; AVX2-NEXT: .LBB7_3: # %cond.store1 +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i16: ; AVX512F: # %bb.0: @@ -2681,8 +2710,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer @@ -2783,33 +2812,61 @@ ; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB8_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB8_3 -; AVX-NEXT: .LBB8_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB8_1: # %cond.store -; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB8_4 -; AVX-NEXT: .LBB8_3: # %cond.store1 -; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB8_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB8_3 +; AVX1-NEXT: .LBB8_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB8_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB8_4 +; AVX1-NEXT: .LBB8_3: # %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB8_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB8_3 +; AVX2-NEXT: .LBB8_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB8_1: # %cond.store +; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB8_4 +; AVX2-NEXT: .LBB8_3: # %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2850,8 +2907,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer @@ -6574,8 +6631,10 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -6583,8 +6642,10 @@ ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} zmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BWVL-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} zmm1 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; AVX512BWVL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -7211,8 +7272,10 @@ ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BWVL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; AVX512BWVL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -7508,8 +7571,10 @@ ; AVX512BWVL-LABEL: truncstore_v8i16_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127] +; AVX512BWVL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65408,65408,65408,65408,65408,65408,65408,65408] +; AVX512BWVL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i16> %mask, zeroinitializer diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s --check-prefix=SSE4 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=AVX512BW @@ -2154,9 +2154,11 @@ ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295] -; AVX2-NEXT: vpxor 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295] +; AVX2-NEXT: # xmm2 = mem[0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2178,7 +2180,7 @@ ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq ; @@ -2273,33 +2275,63 @@ ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [65535,65535] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] -; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB7_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB7_3 -; AVX-NEXT: .LBB7_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB7_1: # %cond.store -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB7_4 -; AVX-NEXT: .LBB7_3: # %cond.store1 -; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [65535,65535] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB7_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB7_3 +; AVX1-NEXT: .LBB7_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB7_1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB7_4 +; AVX1-NEXT: .LBB7_3: # %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535] +; AVX2-NEXT: # xmm3 = mem[0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] +; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB7_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB7_3 +; AVX2-NEXT: .LBB7_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB7_1: # %cond.store +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB7_4 +; AVX2-NEXT: .LBB7_3: # %cond.store1 +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i16: ; AVX512F: # %bb.0: @@ -2340,7 +2372,7 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer @@ -2423,32 +2455,61 @@ ; SSE4-NEXT: pextrb $1, %xmm3, 1(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [255,255] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] -; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB8_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB8_3 -; AVX-NEXT: .LBB8_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB8_1: # %cond.store -; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB8_4 -; AVX-NEXT: .LBB8_3: # %cond.store1 -; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [255,255] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB8_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB8_3 +; AVX1-NEXT: .LBB8_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB8_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB8_4 +; AVX1-NEXT: .LBB8_3: # %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [255,255] +; AVX2-NEXT: # xmm3 = mem[0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: 
vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB8_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB8_3 +; AVX2-NEXT: .LBB8_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB8_1: # %cond.store +; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB8_4 +; AVX2-NEXT: .LBB8_3: # %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2489,7 +2550,7 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer @@ -5902,7 +5963,7 @@ ; AVX2-LABEL: truncstore_v32i16_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpminuw %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpminuw %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 @@ -6142,7 +6203,7 @@ ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminuw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpminuw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero @@ -6383,7 +6444,8 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -6391,7 +6453,8 @@ ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -6772,7 +6835,8 @@ ; AVX2-LABEL: truncstore_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: 
vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminuw %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 @@ -6897,7 +6961,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovmskb %xmm1, %eax @@ -7031,7 +7096,8 @@ ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -7181,72 +7247,140 @@ ; SSE4-NEXT: pextrb $7, %xmm0, 7(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v8i16_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpmovmskb %xmm1, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB17_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB17_3 -; AVX-NEXT: .LBB17_4: # %else2 -; AVX-NEXT: testb $4, %al -; AVX-NEXT: jne .LBB17_5 -; AVX-NEXT: .LBB17_6: # %else4 -; AVX-NEXT: testb $8, %al -; AVX-NEXT: jne .LBB17_7 -; AVX-NEXT: .LBB17_8: # %else6 -; AVX-NEXT: testb $16, %al -; AVX-NEXT: jne .LBB17_9 -; AVX-NEXT: .LBB17_10: # %else8 -; AVX-NEXT: testb $32, %al -; AVX-NEXT: jne .LBB17_11 -; AVX-NEXT: .LBB17_12: # %else10 -; AVX-NEXT: testb $64, %al -; AVX-NEXT: jne .LBB17_13 -; AVX-NEXT: .LBB17_14: # %else12 -; AVX-NEXT: testb $-128, %al -; AVX-NEXT: jne .LBB17_15 -; AVX-NEXT: .LBB17_16: # %else14 -; AVX-NEXT: retq -; AVX-NEXT: .LBB17_1: # %cond.store -; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB17_4 -; AVX-NEXT: .LBB17_3: # %cond.store1 -; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX-NEXT: testb $4, %al -; AVX-NEXT: je .LBB17_6 -; AVX-NEXT: .LBB17_5: # %cond.store3 -; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX-NEXT: testb $8, %al -; AVX-NEXT: je .LBB17_8 -; AVX-NEXT: .LBB17_7: # %cond.store5 -; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX-NEXT: testb $16, %al -; AVX-NEXT: je .LBB17_10 -; AVX-NEXT: .LBB17_9: # %cond.store7 -; AVX-NEXT: vpextrb $4, %xmm0, 4(%rdi) -; AVX-NEXT: testb $32, %al -; AVX-NEXT: je .LBB17_12 -; AVX-NEXT: .LBB17_11: # %cond.store9 -; AVX-NEXT: vpextrb $5, %xmm0, 5(%rdi) -; AVX-NEXT: testb $64, %al -; AVX-NEXT: je .LBB17_14 -; AVX-NEXT: .LBB17_13: # %cond.store11 -; AVX-NEXT: vpextrb $6, %xmm0, 6(%rdi) -; AVX-NEXT: testb $-128, %al -; AVX-NEXT: je .LBB17_16 -; 
AVX-NEXT: .LBB17_15: # %cond.store13 -; AVX-NEXT: vpextrb $7, %xmm0, 7(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v8i16_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB17_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB17_3 +; AVX1-NEXT: .LBB17_4: # %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne .LBB17_5 +; AVX1-NEXT: .LBB17_6: # %else4 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne .LBB17_7 +; AVX1-NEXT: .LBB17_8: # %else6 +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: jne .LBB17_9 +; AVX1-NEXT: .LBB17_10: # %else8 +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: jne .LBB17_11 +; AVX1-NEXT: .LBB17_12: # %else10 +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: jne .LBB17_13 +; AVX1-NEXT: .LBB17_14: # %else12 +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: jne .LBB17_15 +; AVX1-NEXT: .LBB17_16: # %else14 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB17_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB17_4 +; AVX1-NEXT: .LBB17_3: # %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: je .LBB17_6 +; AVX1-NEXT: .LBB17_5: # %cond.store3 +; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: je .LBB17_8 +; AVX1-NEXT: .LBB17_7: # %cond.store5 +; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX1-NEXT: testb $16, %al +; AVX1-NEXT: je .LBB17_10 +; AVX1-NEXT: .LBB17_9: # %cond.store7 +; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX1-NEXT: testb $32, %al +; AVX1-NEXT: je .LBB17_12 +; AVX1-NEXT: .LBB17_11: # %cond.store9 +; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX1-NEXT: testb $64, %al +; AVX1-NEXT: je .LBB17_14 +; AVX1-NEXT: .LBB17_13: # %cond.store11 +; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX1-NEXT: testb $-128, %al +; AVX1-NEXT: je .LBB17_16 +; AVX1-NEXT: .LBB17_15: # %cond.store13 +; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v8i16_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminuw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB17_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB17_3 +; AVX2-NEXT: .LBB17_4: # %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne .LBB17_5 +; AVX2-NEXT: .LBB17_6: # %else4 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne .LBB17_7 +; AVX2-NEXT: .LBB17_8: # %else6 +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: jne .LBB17_9 +; AVX2-NEXT: .LBB17_10: # %else8 +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: jne .LBB17_11 +; AVX2-NEXT: .LBB17_12: # %else10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: jne .LBB17_13 +; AVX2-NEXT: .LBB17_14: # %else12 +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: jne .LBB17_15 +; AVX2-NEXT: .LBB17_16: # %else14 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB17_1: # %cond.store +; AVX2-NEXT: 
vpextrb $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB17_4 +; AVX2-NEXT: .LBB17_3: # %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: je .LBB17_6 +; AVX2-NEXT: .LBB17_5: # %cond.store3 +; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: je .LBB17_8 +; AVX2-NEXT: .LBB17_7: # %cond.store5 +; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX2-NEXT: testb $16, %al +; AVX2-NEXT: je .LBB17_10 +; AVX2-NEXT: .LBB17_9: # %cond.store7 +; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX2-NEXT: testb $32, %al +; AVX2-NEXT: je .LBB17_12 +; AVX2-NEXT: .LBB17_11: # %cond.store9 +; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: je .LBB17_14 +; AVX2-NEXT: .LBB17_13: # %cond.store11 +; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX2-NEXT: testb $-128, %al +; AVX2-NEXT: je .LBB17_16 +; AVX2-NEXT: .LBB17_15: # %cond.store13 +; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v8i16_v8i8: ; AVX512F: # %bb.0: @@ -7255,7 +7389,8 @@ ; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -7332,7 +7467,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i16_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i16> %mask, zeroinitializer diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll --- a/llvm/test/CodeGen/X86/memset-nonzero.ll +++ b/llvm/test/CodeGen/X86/memset-nonzero.ll @@ -2,13 +2,13 @@ ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512dq -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s 
--check-prefix=AVX1 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX2 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX2 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512dq -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX2 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW ; https://llvm.org/bugs/show_bug.cgi?id=27100 @@ -26,11 +26,23 @@ ; SSE2FAST-NEXT: movups %xmm0, (%rdi) ; SSE2FAST-NEXT: retq ; -; AVX-LABEL: memset_16_nonzero_bytes: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX-NEXT: vmovups %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: memset_16_nonzero_bytes: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovups %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_16_nonzero_bytes: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovdqu %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_16_nonzero_bytes: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512-NEXT: vmovdqu %xmm0, (%rdi) +; AVX512-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1) ret void } @@ -52,12 +64,26 @@ ; SSE2FAST-NEXT: movups %xmm0, (%rdi) ; SSE2FAST-NEXT: retq ; -; AVX-LABEL: memset_32_nonzero_bytes: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: memset_32_nonzero_bytes: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_32_nonzero_bytes: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_32_nonzero_bytes: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1) ret void } @@ -95,9 +121,9 @@ ; ; AVX2-LABEL: memset_64_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX2-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovdqu %ymm0, 
32(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -110,8 +136,8 @@ ; AVX512BW-LABEL: memset_64_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -166,11 +192,11 @@ ; ; AVX2-LABEL: memset_128_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX2-NEXT: vmovups %ymm0, 96(%rdi) -; AVX2-NEXT: vmovups %ymm0, 64(%rdi) -; AVX2-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -184,9 +210,9 @@ ; ; AVX512BW-LABEL: memset_128_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1) @@ -237,15 +263,15 @@ ; ; AVX2-LABEL: memset_256_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX2-NEXT: vmovups %ymm0, 224(%rdi) -; AVX2-NEXT: vmovups %ymm0, 192(%rdi) -; AVX2-NEXT: vmovups %ymm0, 160(%rdi) -; AVX2-NEXT: vmovups %ymm0, 128(%rdi) -; AVX2-NEXT: vmovups %ymm0, 96(%rdi) -; AVX2-NEXT: vmovups %ymm0, 64(%rdi) -; AVX2-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovdqu %ymm0, 224(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 192(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 160(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 128(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -261,11 +287,11 @@ ; ; AVX512BW-LABEL: memset_256_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 =
[42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX512BW-NEXT: vmovups %zmm0, 192(%rdi) -; AVX512BW-NEXT: vmovups %zmm0, 128(%rdi) -; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovdqu64 %zmm0, 192(%rdi) +; AVX512BW-NEXT: vmovdqu64 %zmm0, 128(%rdi) +; AVX512BW-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1) diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -3,9 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1-FALLBACK ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2-FALLBACK -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=XOP,XOP-FALLBACK -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=XOP-FALLBACK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512VL-FALLBACK ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW-FALLBACK @@ -975,7 +975,8 @@ ; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_reg: ; AVX2-FALLBACK: # %bb.0: ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX2-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm3 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm4 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 @@ -992,25 +993,66 @@ ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; AVX2-FALLBACK-NEXT: retq ; -; XOP-LABEL: vec128_i64_signed_reg_reg: -; XOP: # %bb.0: -; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2 -; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; XOP-NEXT: vpcomltq %xmm1, %xmm0, %xmm4 -; XOP-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 -; XOP-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2 -; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpsrlq $32, %xmm3, %xmm4 -; XOP-NEXT: 
vpmuludq %xmm4, %xmm2, %xmm4 -; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 -; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; XOP-NEXT: retq +; XOP-FALLBACK-LABEL: vec128_i64_signed_reg_reg: +; XOP-FALLBACK: # %bb.0: +; XOP-FALLBACK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2 +; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpcomltq %xmm1, %xmm0, %xmm4 +; XOP-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 +; XOP-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpsubq %xmm4, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm2 +; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4 +; XOP-FALLBACK-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOP-FALLBACK-NEXT: retq +; +; XOPAVX1-LABEL: vec128_i64_signed_reg_reg: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2 +; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpcomltq %xmm1, %xmm0, %xmm4 +; XOPAVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 +; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsubq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm2 +; XOPAVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 +; XOPAVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: vec128_i64_signed_reg_reg: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm3 +; XOPAVX2-NEXT: vpcomltq %xmm1, %xmm0, %xmm4 +; XOPAVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 +; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsubq %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm2 +; XOPAVX2-NEXT: vpsrlq $33, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsrlq $32, %xmm3, %xmm4 +; XOPAVX2-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; XOPAVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_i64_signed_reg_reg: ; AVX512F: # %bb.0: @@ -1018,7 +1060,7 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1054,7 +1096,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; 
AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1194,11 +1236,12 @@ ; ; AVX2-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg: ; AVX2-FALLBACK: # %bb.0: -; AVX2-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-FALLBACK-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 -; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 +; AVX2-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm5 = [1,1] +; AVX2-FALLBACK-NEXT: vpor %xmm5, %xmm4, %xmm5 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm1 @@ -1215,25 +1258,66 @@ ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; AVX2-FALLBACK-NEXT: retq ; -; XOP-LABEL: vec128_i64_unsigned_reg_reg: -; XOP: # %bb.0: -; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2 -; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; XOP-NEXT: vpcomltuq %xmm1, %xmm0, %xmm4 -; XOP-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 -; XOP-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 -; XOP-NEXT: vpsubq %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2 -; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vpsrlq $32, %xmm3, %xmm4 -; XOP-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 -; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 -; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; XOP-NEXT: retq +; XOP-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg: +; XOP-FALLBACK: # %bb.0: +; XOP-FALLBACK-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2 +; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpcomltuq %xmm1, %xmm0, %xmm4 +; XOP-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 +; XOP-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpsubq %xmm4, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm2 +; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4 +; XOP-FALLBACK-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOP-FALLBACK-NEXT: retq +; +; XOPAVX1-LABEL: vec128_i64_unsigned_reg_reg: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2 +; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpcomltuq %xmm1, %xmm0, %xmm4 +; XOPAVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 +; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsubq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm2 +; XOPAVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 +; XOPAVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; 
XOPAVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: vec128_i64_unsigned_reg_reg: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm3 +; XOPAVX2-NEXT: vpcomltuq %xmm1, %xmm0, %xmm4 +; XOPAVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 +; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsubq %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm2 +; XOPAVX2-NEXT: vpsrlq $33, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsrlq $32, %xmm3, %xmm4 +; XOPAVX2-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; XOPAVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512F-LABEL: vec128_i64_unsigned_reg_reg: ; AVX512F: # %bb.0: @@ -1241,7 +1325,7 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 @@ -1277,7 +1361,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 @@ -1421,7 +1505,8 @@ ; AVX2-FALLBACK: # %bb.0: ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX2-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] +; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm3 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm4 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm4, %xmm1, %xmm0, %xmm4 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 @@ -1438,26 +1523,69 @@ ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; AVX2-FALLBACK-NEXT: retq ; -; XOP-LABEL: vec128_i64_signed_mem_reg: -; XOP: # %bb.0: -; XOP-NEXT: vmovdqa (%rdi), %xmm1 -; XOP-NEXT: vpcomgtq %xmm0, %xmm1, %xmm2 -; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; XOP-NEXT: vpcomltq %xmm0, %xmm1, %xmm4 -; XOP-NEXT: vblendvpd %xmm4, %xmm1, %xmm0, %xmm4 -; XOP-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vpsubq %xmm4, %xmm0, %xmm0 -; XOP-NEXT: vpsrlq $1, %xmm0, %xmm2 -; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 -; XOP-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 -; XOP-NEXT: vpsrlq $32, %xmm3, %xmm4 -; XOP-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 -; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0 -; XOP-NEXT: vpsllq $32, %xmm0, %xmm0 -; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; XOP-NEXT: retq +; XOP-FALLBACK-LABEL: vec128_i64_signed_mem_reg: +; XOP-FALLBACK: # %bb.0: +; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 +; XOP-FALLBACK-NEXT: vpcomgtq %xmm0, %xmm1, %xmm2 +; XOP-FALLBACK-NEXT: 
vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpcomltq %xmm0, %xmm1, %xmm4
+; XOP-FALLBACK-NEXT: vblendvpd %xmm4, %xmm1, %xmm0, %xmm4
+; XOP-FALLBACK-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpsubq %xmm4, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpsrlq $1, %xmm0, %xmm2
+; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; XOP-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOP-FALLBACK-NEXT: retq
+;
+; XOPAVX1-LABEL: vec128_i64_signed_mem_reg:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
+; XOPAVX1-NEXT: vpcomgtq %xmm0, %xmm1, %xmm2
+; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpcomltq %xmm0, %xmm1, %xmm4
+; XOPAVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm0, %xmm4
+; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsrlq $1, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
+; XOPAVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: vec128_i64_signed_mem_reg:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
+; XOPAVX2-NEXT: vpcomgtq %xmm0, %xmm1, %xmm2
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm3
+; XOPAVX2-NEXT: vpcomltq %xmm0, %xmm1, %xmm4
+; XOPAVX2-NEXT: vblendvpd %xmm4, %xmm1, %xmm0, %xmm4
+; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsubq %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsrlq $1, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpsrlq $33, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsrlq $32, %xmm3, %xmm4
+; XOPAVX2-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; XOPAVX2-NEXT: vpsllq $32, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_mem_reg:
 ; AVX512F: # %bb.0:
@@ -1465,7 +1593,7 @@
 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2
 ; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
@@ -1502,7 +1630,7 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
@@ -1644,7 +1772,8 @@
 ; AVX2-FALLBACK: # %bb.0:
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; AVX2-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm3
 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm4
 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
@@ -1661,26 +1790,69 @@
 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
 ; AVX2-FALLBACK-NEXT: retq
 ;
-; XOP-LABEL: vec128_i64_signed_reg_mem:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa (%rdi), %xmm1
-; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
-; XOP-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
-; XOP-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2
-; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq $32, %xmm3, %xmm4
-; XOP-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
-; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
-; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
-; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; XOP-NEXT: retq
+; XOP-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
+; XOP-FALLBACK: # %bb.0:
+; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
+; XOP-FALLBACK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
+; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
+; XOP-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
+; XOP-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpsubq %xmm4, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm2
+; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOP-FALLBACK-NEXT: retq
+;
+; XOPAVX1-LABEL: vec128_i64_signed_reg_mem:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
+; XOPAVX1-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
+; XOPAVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
+; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsubq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm2
+; XOPAVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
+; XOPAVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: vec128_i64_signed_reg_mem:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
+; XOPAVX2-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm3
+; XOPAVX2-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
+; XOPAVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
+; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsubq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm2
+; XOPAVX2-NEXT: vpsrlq $33, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlq $32, %xmm3, %xmm4
+; XOPAVX2-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT: vpsllq $32, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_reg_mem:
 ; AVX512F: # %bb.0:
@@ -1688,7 +1860,7 @@
 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1725,7 +1897,7 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1871,7 +2043,8 @@
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; AVX2-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm3
 ; AVX2-FALLBACK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm4
 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; AVX2-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
@@ -1888,27 +2061,72 @@
 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
 ; AVX2-FALLBACK-NEXT: retq
 ;
-; XOP-LABEL: vec128_i64_signed_mem_mem:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa (%rdi), %xmm0
-; XOP-NEXT: vmovdqa (%rsi), %xmm1
-; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
-; XOP-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
-; XOP-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2
-; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq $32, %xmm3, %xmm4
-; XOP-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
-; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1
-; XOP-NEXT: vpsllq $32, %xmm1, %xmm1
-; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; XOP-NEXT: retq
+; XOP-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
+; XOP-FALLBACK: # %bb.0:
+; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
+; XOP-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
+; XOP-FALLBACK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
+; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
+; XOP-FALLBACK-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
+; XOP-FALLBACK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpsubq %xmm4, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlq $1, %xmm1, %xmm2
+; XOP-FALLBACK-NEXT: vpsrlq $33, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlq $32, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; XOP-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOP-FALLBACK-NEXT: retq
+;
+; XOPAVX1-LABEL: vec128_i64_signed_mem_mem:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm0
+; XOPAVX1-NEXT: vmovdqa (%rsi), %xmm1
+; XOPAVX1-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
+; XOPAVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
+; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsubq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm2
+; XOPAVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
+; XOPAVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: vec128_i64_signed_mem_mem:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm0
+; XOPAVX2-NEXT: vmovdqa (%rsi), %xmm1
+; XOPAVX2-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm3
+; XOPAVX2-NEXT: vpcomltq %xmm1, %xmm0, %xmm4
+; XOPAVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
+; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsubq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm2
+; XOPAVX2-NEXT: vpsrlq $33, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlq $32, %xmm3, %xmm4
+; XOPAVX2-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpaddq %xmm1, %xmm4, %xmm1
+; XOPAVX2-NEXT: vpsllq $32, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_mem_mem:
 ; AVX512F: # %bb.0:
@@ -1916,7 +2134,7 @@
 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -1954,7 +2172,7 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
@@ -2020,7 +2238,8 @@
 ; AVX2-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
 ; AVX2-FALLBACK: # %bb.0:
 ; AVX2-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
 ; AVX2-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
@@ -2029,21 +2248,45 @@
 ; AVX2-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT: retq
 ;
-; XOP-LABEL: vec128_i16_signed_reg_reg:
-; XOP: # %bb.0:
-; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm3
-; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
-; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
-; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
-; XOP-NEXT: retq
+; XOP-FALLBACK-LABEL: vec128_i16_signed_reg_reg:
+; XOP-FALLBACK: # %bb.0:
+; XOP-FALLBACK-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
+; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
+; XOP-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
+; XOP-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: retq
+;
+; XOPAVX1-LABEL: vec128_i16_signed_reg_reg:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm3
+; XOPAVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: vec128_i16_signed_reg_reg:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm3
+; XOPAVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: vec128_i16_signed_reg_reg:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpminsw %xmm1, %xmm0, %xmm3
 ; AVX512F-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT: vpsubw %xmm3, %xmm1, %xmm1
@@ -2070,7 +2313,7 @@
 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
@@ -2159,7 +2402,8 @@
 ; AVX2-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
 ; AVX2-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
 ; AVX2-FALLBACK-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX2-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm4, %xmm3, %xmm3
 ; AVX2-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
@@ -2167,23 +2411,47 @@
 ; AVX2-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT: retq
 ;
-; XOP-LABEL: vec128_i16_unsigned_reg_reg:
-; XOP: # %bb.0:
-; XOP-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; XOP-NEXT: vpminuw %xmm1, %xmm0, %xmm3
-; XOP-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
-; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
-; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
-; XOP-NEXT: retq
+; XOP-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
+; XOP-FALLBACK: # %bb.0:
+; XOP-FALLBACK-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm2
+; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm3
+; XOP-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; XOP-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: retq
+;
+; XOPAVX1-LABEL: vec128_i16_unsigned_reg_reg:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm3
+; XOPAVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: vec128_i16_unsigned_reg_reg:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm3
+; XOPAVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: vec128_i16_unsigned_reg_reg:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpminuw %xmm1, %xmm0, %xmm2
 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
 ; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %xmm4, %xmm3, %xmm3
 ; AVX512F-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT: vpsubw %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
@@ -2211,7 +2479,7 @@
 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
@@ -2279,7 +2547,8 @@
 ; AVX2-FALLBACK: # %bb.0:
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX2-FALLBACK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm2
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm3
 ; AVX2-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT: vpsubw %xmm3, %xmm0, %xmm0
@@ -2288,23 +2557,49 @@
 ; AVX2-FALLBACK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT: retq
 ;
-; XOP-LABEL: vec128_i16_signed_mem_reg:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa (%rdi), %xmm1
-; XOP-NEXT: vpcomgtw %xmm0, %xmm1, %xmm2
-; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; XOP-NEXT: vpminsw %xmm0, %xmm1, %xmm3
-; XOP-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
-; XOP-NEXT: vpsubw %xmm3, %xmm0, %xmm0
-; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
-; XOP-NEXT: vpmacsww %xmm1, %xmm2, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOP-FALLBACK-LABEL: vec128_i16_signed_mem_reg:
+; XOP-FALLBACK: # %bb.0:
+; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
+; XOP-FALLBACK-NEXT: vpcomgtw %xmm0, %xmm1, %xmm2
+; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm3
+; XOP-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: vpsubw %xmm3, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpmacsww %xmm1, %xmm2, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: retq
+;
+; XOPAVX1-LABEL: vec128_i16_signed_mem_reg:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
+; XOPAVX1-NEXT: vpcomgtw %xmm0, %xmm1, %xmm2
+; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT: vpminsw %xmm0, %xmm1, %xmm3
+; XOPAVX1-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpmacsww %xmm1, %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: vec128_i16_signed_mem_reg:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
+; XOPAVX2-NEXT: vpcomgtw %xmm0, %xmm1, %xmm2
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpminsw %xmm0, %xmm1, %xmm3
+; XOPAVX2-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
+; XOPAVX2-NEXT: vpsubw %xmm3, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpmacsww %xmm1, %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: vec128_i16_signed_mem_reg:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpminsw %xmm0, %xmm1, %xmm3
 ; AVX512F-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT: vpsubw %xmm3, %xmm0, %xmm0
@@ -2332,7 +2627,7 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0
@@ -2400,7 +2695,8 @@
 ; AVX2-FALLBACK: # %bb.0:
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX2-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
 ; AVX2-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
@@ -2409,23 +2705,49 @@
 ; AVX2-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT: retq
 ;
-; XOP-LABEL: vec128_i16_signed_reg_mem:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa (%rdi), %xmm1
-; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm3
-; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
-; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
-; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
-; XOP-NEXT: retq
+; XOP-FALLBACK-LABEL: vec128_i16_signed_reg_mem:
+; XOP-FALLBACK: # %bb.0:
+; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
+; XOP-FALLBACK-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
+; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
+; XOP-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
+; XOP-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: retq
+;
+; XOPAVX1-LABEL: vec128_i16_signed_reg_mem:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1
+; XOPAVX1-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm3
+; XOPAVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: vec128_i16_signed_reg_mem:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
+; XOPAVX2-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm3
+; XOPAVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: vec128_i16_signed_reg_mem:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpminsw %xmm1, %xmm0, %xmm3
 ; AVX512F-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT: vpsubw %xmm3, %xmm1, %xmm1
@@ -2453,7 +2775,7 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
@@ -2524,7 +2846,8 @@
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX2-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
 ; AVX2-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
@@ -2533,25 +2856,53 @@
 ; AVX2-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT: retq
 ;
-; XOP-LABEL: vec128_i16_signed_mem_mem:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa (%rdi), %xmm0
-; XOP-NEXT: vmovdqa (%rsi), %xmm1
-; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; XOP-NEXT: vpminsw %xmm1, %xmm0, %xmm3
-; XOP-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
-; XOP-NEXT: vpsubw %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
-; XOP-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
-; XOP-NEXT: retq
+; XOP-FALLBACK-LABEL: vec128_i16_signed_mem_mem:
+; XOP-FALLBACK: # %bb.0:
+; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
+; XOP-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
+; XOP-FALLBACK-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
+; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOP-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3
+; XOP-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
+; XOP-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT: retq
+;
+; XOPAVX1-LABEL: vec128_i16_signed_mem_mem:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm0
+; XOPAVX1-NEXT: vmovdqa (%rsi), %xmm1
+; XOPAVX1-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOPAVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm3
+; XOPAVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: vec128_i16_signed_mem_mem:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm0
+; XOPAVX2-NEXT: vmovdqa (%rsi), %xmm1
+; XOPAVX2-NEXT: vpcomgtw %xmm1, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm3
+; XOPAVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpmacsww %xmm0, %xmm2, %xmm1, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: vec128_i16_signed_mem_mem:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpminsw %xmm1, %xmm0, %xmm3
 ; AVX512F-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT: vpsubw %xmm3, %xmm1, %xmm1
@@ -2580,7 +2931,7 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1
@@ -2707,16 +3058,19 @@
 ; AVX2-FALLBACK-LABEL: vec128_i8_signed_reg_reg:
 ; AVX2-FALLBACK: # %bb.0:
 ; AVX2-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3
 ; AVX2-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-FALLBACK-NEXT: vpand %xmm3, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX2-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX2-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FALLBACK-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX2-FALLBACK-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-FALLBACK-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2764,7 +3118,8 @@
 ; XOPAVX2-LABEL: vec128_i8_signed_reg_reg:
 ; XOPAVX2: # %bb.0:
 ; XOPAVX2-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
-; XOPAVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm3
 ; XOPAVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; XOPAVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
@@ -2773,7 +3128,8 @@
 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; XOPAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2783,12 +3139,14 @@
 ; AVX512F-LABEL: vec128_i8_signed_reg_reg:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpminsb %xmm1, %xmm0, %xmm3
 ; AVX512F-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT: vpsubb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
@@ -2804,7 +3162,8 @@
 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3
 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1
 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -2817,13 +3176,14 @@
 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512BW-FALLBACK-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
 ; AVX512BW-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
@@ -2839,7 +3199,8 @@
 ; AVX512VLBW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1}
 ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2940,15 +3301,18 @@
 ; AVX2-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
 ; AVX2-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
 ; AVX2-FALLBACK-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX2-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm4, %xmm3, %xmm3
 ; AVX2-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-FALLBACK-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
 ; AVX2-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX2-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FALLBACK-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX2-FALLBACK-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-FALLBACK-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2996,7 +3360,8 @@
 ; XOPAVX2-LABEL: vec128_i8_unsigned_reg_reg:
 ; XOPAVX2: # %bb.0:
 ; XOPAVX2-NEXT: vpcomgtub %xmm1, %xmm0, %xmm2
-; XOPAVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpminub %xmm1, %xmm0, %xmm3
 ; XOPAVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
 ; XOPAVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
@@ -3005,7 +3370,8 @@
 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; XOPAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -3017,11 +3383,13 @@
 ; AVX512F-NEXT: vpminub %xmm1, %xmm0, %xmm2
 ; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
 ; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %xmm4, %xmm3, %xmm3
 ; AVX512F-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
@@ -3037,7 +3405,8 @@
 ; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %xmm3, %xmm1, %xmm1
 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -3051,13 +3420,14 @@
 ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512BW-FALLBACK-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
 ; AVX512BW-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
@@ -3073,7 +3443,8 @@
 ; AVX512VLBW-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1}
 ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -3181,16 +3552,19 @@
 ; AVX2-FALLBACK: # %bb.0:
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX2-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm3
 ; AVX2-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT: vpsubb %xmm3, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX2-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-FALLBACK-NEXT: vpand %xmm3, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX2-FALLBACK-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX2-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FALLBACK-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-FALLBACK-NEXT: vextracti128 $1, %ymm0, %xmm2
 ; AVX2-FALLBACK-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
@@ -3241,7 +3615,8 @@
 ; XOPAVX2: # %bb.0:
 ; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
 ; XOPAVX2-NEXT: vpcomgtb %xmm0, %xmm1, %xmm2
-; XOPAVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpminsb %xmm0, %xmm1, %xmm3
 ; XOPAVX2-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
 ; XOPAVX2-NEXT: vpsubb %xmm3, %xmm0, %xmm0
@@ -3250,7 +3625,8 @@
 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; XOPAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; XOPAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
 ; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; XOPAVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
@@ -3261,12 +3637,14 @@
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpminsb %xmm0, %xmm1, %xmm3
 ; AVX512F-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT: vpsubb %xmm3, %xmm0, %xmm0
 ; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm0
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
@@ -3283,7 +3661,8 @@
 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm3
 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm0, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm0, %xmm2, %xmm0
@@ -3296,13 +3675,14 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
 ; AVX512BW-FALLBACK-NEXT: vpsubb %xmm2, %xmm0, %xmm0
 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512BW-FALLBACK-NEXT: vpand %xmm2, %xmm0, %xmm0
 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
 ; AVX512BW-FALLBACK-NEXT: vpmullw %ymm2, %ymm0, %ymm0
@@ -3319,7 +3699,8 @@
 ; AVX512VLBW-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpsubb %xmm0, %xmm2, %xmm0 {%k1}
 ; AVX512VLBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
@@ -3424,16 +3805,19 @@
 ; AVX2-FALLBACK: # %bb.0:
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX2-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3
 ; AVX2-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-FALLBACK-NEXT: vpand %xmm3, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX2-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX2-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FALLBACK-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX2-FALLBACK-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-FALLBACK-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -3484,7 +3868,8 @@
 ; XOPAVX2: # %bb.0:
 ; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1
 ; XOPAVX2-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
-; XOPAVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm3
 ; XOPAVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; XOPAVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
@@ -3493,7 +3878,8 @@
 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; XOPAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -3504,12 +3890,14 @@
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpminsb %xmm1, %xmm0, %xmm3
 ; AVX512F-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT: vpsubb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
@@ -3526,7 +3914,8 @@
 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3
 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1
 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -3539,13 +3928,14 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512BW-FALLBACK-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
 ; AVX512BW-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
@@ -3562,7 +3952,8 @@
 ; AVX512VLBW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1}
 ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -3671,16 +4062,19 @@
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX2-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
-; AVX2-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-FALLBACK-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3
 ; AVX2-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-FALLBACK-NEXT: vpand %xmm3, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX2-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX2-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FALLBACK-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX2-FALLBACK-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-FALLBACK-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -3734,7 +4128,8 @@
 ; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm0
 ; XOPAVX2-NEXT: vmovdqa (%rsi), %xmm1
 ; XOPAVX2-NEXT: vpcomgtb %xmm1, %xmm0, %xmm2
-; XOPAVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm3
 ; XOPAVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; XOPAVX2-NEXT: vpsubb %xmm3, %xmm1, %xmm1
@@ -3743,7 +4138,8 @@
 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; XOPAVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; XOPAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; XOPAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; XOPAVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -3755,12 +4151,14 @@
 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpminsb %xmm1, %xmm0, %xmm3
 ; AVX512F-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT: vpsubb %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
@@ -3778,7 +4176,8 @@
 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3
 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1
 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -3791,13 +4190,14 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512BW-FALLBACK-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
 ; AVX512BW-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
@@ -3815,7 +4215,8 @@
 ; AVX512VLBW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm1 {%k1}
 ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1590,7 +1590,8 @@
 ; AVX2-LABEL: vec256_i16_signed_reg_reg:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm3
 ; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1
@@ -1646,7 +1647,8 @@
 ; AVX512F-LABEL: vec256_i16_signed_reg_reg:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm3
 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
@@ -1673,7 +1675,7 @@
 ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
@@ -1739,7 +1741,8 @@
 ; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3
 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
 ; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpsubw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
@@ -1796,7 +1799,8 @@
 ; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3
 ; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
@@ -1823,7 +1827,7 @@
 ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
@@ -1887,7 +1891,8 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2
-; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpminsw %ymm0, %ymm1, %ymm3
 ; AVX2-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: vpsubw %ymm3, %ymm0, %ymm0
@@ -1946,7 +1951,8 @@
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT: vpminsw %ymm0, %ymm1, %ymm3
 ; AVX512F-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
@@ -1974,7 +1980,7 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0
@@ -2038,7 +2044,8 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm3
 ; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1
@@ -2097,7 +2104,8 @@
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm3
 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
@@ -2125,7 +2133,7 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
@@ -2191,7 +2199,8 @@
 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1
 ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm3
 ; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1
@@ -2253,7 +2262,8 @@
 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm3
 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
@@ -2282,7 +2292,7 @@
 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1
 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
 ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2
 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
@@ -2372,16 +2382,18 @@
 ; AVX2-LABEL: vec256_i8_signed_reg_reg:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm3
 ; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
;
AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2470,16 +2482,18 @@ ; AVX512F-LABEL: vec256_i8_signed_reg_reg: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm3 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2495,7 +2509,8 @@ ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 @@ -2508,13 +2523,14 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; 
AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-FALLBACK-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero ; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm1, %zmm1 @@ -2529,7 +2545,8 @@ ; AVX512VLBW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -2599,15 +2616,17 @@ ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 ; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} 
ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2698,15 +2717,17 @@ ; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 ; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2722,7 +2743,8 @@ ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1 @@ -2736,13 +2758,14 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-FALLBACK-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero ; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm1, %zmm1 @@ -2757,7 +2780,8 @@ ; AVX512VLBW-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -2825,16 +2849,18 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpminsb %ymm0, %ymm1, %ymm3 ; AVX2-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2926,16 +2952,18 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; 
AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2 -; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm3 ; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2952,7 +2980,8 @@ ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpxor %ymm0, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm2, %ymm0 @@ -2965,13 +2994,14 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpsubb %ymm2, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-FALLBACK-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero ; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm0, %zmm0 @@ -2987,7 +3017,8 @@ ; AVX512VLBW-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VLBW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %ymm0, %ymm2, %ymm0 {%k1} ; AVX512VLBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 @@ -3054,16 +3085,18 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm3 ; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -3155,16 +3188,18 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpminsb %ymm1, %ymm0, 
%ymm3 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -3181,7 +3216,8 @@ ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 @@ -3194,13 +3230,14 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-FALLBACK-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero ; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm1, %zmm1 @@ -3216,7 +3253,8 @@ ; AVX512VLBW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -3285,16 +3323,18 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm3 ; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -3389,16 +3429,18 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm3 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} 
ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -3416,7 +3458,8 @@ ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 @@ -3429,13 +3472,14 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-FALLBACK-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BW-FALLBACK-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero ; AVX512BW-FALLBACK-NEXT: vpmullw %zmm2, %zmm1, %zmm1 @@ -3452,7 +3496,8 @@ ; AVX512VLBW-NEXT: vpmaxsb 
%ymm1, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1 {%k1} ; AVX512VLBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -703,7 +703,9 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm2 @@ -732,7 +734,9 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm2 @@ -752,7 +756,8 @@ ; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 @@ -785,7 +790,9 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512F-NEXT: vpsubb %ymm2, %ymm4, %ymm2 @@ -814,7 +821,9 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw 
$1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm4, %ymm2 @@ -834,7 +843,8 @@ ; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 @@ -855,61 +865,65 @@ define <64 x i8> @vec512_i8_signed_mem_reg(<64 x i8>* %a1_addr, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm6 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm5 +; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm2 ; AVX512F-NEXT: vpsubb %ymm0, %ymm5, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm2 
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -921,7 +935,8 @@ ; AVX512BW-NEXT: vpmaxsb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %zmm0, %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 @@ -956,7 +971,9 @@ ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm2 @@ -986,7 +1003,9 @@ ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm2 @@ -1007,7 +1026,8 @@ ; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 @@ -1027,64 +1047,68 @@ define <64 x i8> @vec512_i8_signed_mem_mem(<64 x i8>* %a1_addr, <64 x i8>* %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm6 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsb %ymm1, %ymm0, 
%ymm5
+; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm6
+; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vpsubb %ymm0, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm5
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
+; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512VL-FALLBACK-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_mem:
@@ -1096,7 +1120,8 @@
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm1 {%k1}
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -662,7 +662,7 @@
; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4
; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -723,7 +723,7 @@
; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2
-; CHECK-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2
; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
@@ -1643,7 +1643,8 @@
define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind "min-legal-vector-width"="256" {
; CHECK-AVX512-LABEL: var_rotate_v16i8:
; CHECK-AVX512: # %bb.0:
-; CHECK-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; CHECK-AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; CHECK-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; CHECK-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
@@ -1657,8 +1658,9 @@
; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-VBMI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; CHECK-VBMI-NEXT: vpermb %ymm0, %ymm2, %ymm0
+; CHECK-VBMI-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; CHECK-VBMI-NEXT: vpand %xmm2, %xmm1, %xmm1
; CHECK-VBMI-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; CHECK-VBMI-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
; CHECK-VBMI-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -1675,7 +1677,8 @@
define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: var_rotate_v32i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -1750,7 +1753,8 @@
; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1
; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
-; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%shl = shl <32 x i8> %a,
%lshr = lshr <32 x i8> %a,
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -931,7 +931,8 @@
;
; SKX-LABEL: allones_v16i8_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmb %xmm1, %xmm0, %k0
; SKX-NEXT: kortestw %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: retq
@@ -1201,7 +1202,8 @@
;
; SKX-LABEL: allzeros_v16i8_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmb %xmm1, %xmm0, %k0
; SKX-NEXT: kortestw %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: retq
@@ -1253,7 +1255,8 @@
;
; SKX-LABEL: allones_v32i8_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmb %ymm1, %ymm0, %k0
; SKX-NEXT: kortestd %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: vzeroupper
@@ -1306,7 +1309,8 @@
;
; SKX-LABEL: allzeros_v32i8_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmb %ymm1, %ymm0, %k0
; SKX-NEXT: kortestd %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: vzeroupper
@@ -1357,8 +1361,10 @@
; KNL-LABEL: allones_v64i8_and1:
; KNL: # %bb.0:
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; KNL-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
+; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpmovmskb %ymm0, %eax
; KNL-NEXT: cmpl $-1, %eax
; KNL-NEXT: sete %al
@@ -1367,7 +1373,8 @@
;
; SKX-LABEL: allones_v64i8_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmb %zmm1, %zmm0, %k0
; SKX-NEXT: kortestq %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: vzeroupper
@@ -1418,8 +1425,10 @@
; KNL-LABEL: allzeros_v64i8_and1:
; KNL: # %bb.0:
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; KNL-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
+; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpmovmskb %ymm0, %eax
; KNL-NEXT: testl %eax, %eax
; KNL-NEXT: sete %al
@@ -1428,7 +1437,8 @@
;
; SKX-LABEL: allzeros_v64i8_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmb %zmm1, %zmm0, %k0
; SKX-NEXT: kortestq %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: vzeroupper
@@ -1473,7 +1483,8 @@
;
; SKX-LABEL: allones_v8i16_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmw %xmm1, %xmm0, %k0
; SKX-NEXT: kortestb %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: retq
@@ -1515,7 +1526,8 @@
;
; SKX-LABEL: allzeros_v8i16_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmw %xmm1, %xmm0, %k0
; SKX-NEXT: kortestb %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: retq
@@ -1573,7 +1585,8 @@
;
; SKX-LABEL: allones_v16i16_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmw %ymm1, %ymm0, %k0
; SKX-NEXT: kortestw %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: vzeroupper
@@ -1649,7 +1662,8 @@
;
; SKX-LABEL: allones_v32i16_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmw %zmm1, %zmm0, %k0
; SKX-NEXT: kortestd %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: vzeroupper
@@ -1719,7 +1733,8 @@
;
; SKX-LABEL: allzeros_v32i16_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmw %zmm1, %zmm0, %k0
; SKX-NEXT: kortestd %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: vzeroupper
@@ -1776,7 +1791,8 @@
;
; SKX-LABEL: allzeros_v16i16_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vptestmw %ymm1, %ymm0, %k0
; SKX-NEXT: kortestw %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: vzeroupper
@@ -2118,8 +2134,7 @@
; KNL-LABEL: allones_v2i64_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1]
-; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $3, %al
; KNL-NEXT: sete %al
@@ -2128,7 +2143,7 @@
;
; SKX-LABEL: allones_v2i64_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: cmpb $3, %al
; SKX-NEXT: sete %al
@@ -2160,8 +2175,7 @@
; KNL-LABEL: allzeros_v2i64_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1]
-; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $3, %al
; KNL-NEXT: sete %al
@@ -2170,7 +2184,7 @@
;
; SKX-LABEL: allzeros_v2i64_and1:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0
; SKX-NEXT: kortestb %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: retq
@@ -2452,7 +2466,8 @@
;
; SKX-LABEL: allones_v16i8_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmb %xmm1, %xmm0, %k0
; SKX-NEXT: kortestw %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: retq
@@ -2490,7 +2505,8 @@
;
; SKX-LABEL: allzeros_v16i8_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmb %xmm1, %xmm0, %k0
; SKX-NEXT: kortestw %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: retq
@@ -2542,7 +2558,8 @@
;
; SKX-LABEL: allones_v32i8_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmb %ymm1, %ymm0, %k0
; SKX-NEXT: kortestd %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: vzeroupper
@@ -2595,7 +2612,8 @@
;
; SKX-LABEL: allzeros_v32i8_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmb %ymm1, %ymm0, %k0
; SKX-NEXT: kortestd %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: vzeroupper
@@ -2646,8 +2664,10 @@
; KNL-LABEL: allones_v64i8_and4:
; KNL: # %bb.0:
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; KNL-NEXT: vpbroadcastb {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
; KNL-NEXT: vpand %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpsllw $5, %ymm0, %ymm0
+; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpmovmskb %ymm0, %eax
; KNL-NEXT: cmpl $-1, %eax
; KNL-NEXT: sete %al
@@ -2656,7 +2676,8 @@
;
; SKX-LABEL: allones_v64i8_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmb %zmm1, %zmm0, %k0
; SKX-NEXT: kortestq %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: vzeroupper
@@ -2707,8 +2728,10 @@
; KNL-LABEL: allzeros_v64i8_and4:
; KNL: # %bb.0:
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; KNL-NEXT: vpbroadcastb {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpsllw $5, %ymm0, %ymm0
+; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpmovmskb %ymm0, %eax
; KNL-NEXT: testl %eax, %eax
; KNL-NEXT: sete %al
@@ -2717,7 +2740,8 @@
;
; SKX-LABEL: allzeros_v64i8_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; SKX-NEXT: vpbroadcastb {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmb %zmm1, %zmm0, %k0
; SKX-NEXT: kortestq %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: vzeroupper
@@ -2762,7 +2786,8 @@
;
; SKX-LABEL: allones_v8i16_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmw %xmm1, %xmm0, %k0
; SKX-NEXT: kortestb %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: retq
@@ -2804,7 +2829,8 @@
;
; SKX-LABEL: allzeros_v8i16_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmw %xmm1, %xmm0, %k0
; SKX-NEXT: kortestb %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: retq
@@ -2862,7 +2888,8 @@
;
; SKX-LABEL: allones_v16i16_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmw %ymm1, %ymm0, %k0
; SKX-NEXT: kortestw %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: vzeroupper
@@ -2938,7 +2965,8 @@
;
; SKX-LABEL: allones_v32i16_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmw %zmm1, %zmm0, %k0
; SKX-NEXT: kortestd %k0, %k0
; SKX-NEXT: setb %al
; SKX-NEXT: vzeroupper
@@ -3008,7 +3036,8 @@
;
; SKX-LABEL: allzeros_v32i16_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmw %zmm1, %zmm0, %k0
; SKX-NEXT: kortestd %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: vzeroupper
@@ -3065,7 +3094,8 @@
;
; SKX-LABEL: allzeros_v16i16_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; SKX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SKX-NEXT: vptestmw %ymm1, %ymm0, %k0
; SKX-NEXT: kortestw %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: vzeroupper
@@ -3407,8 +3437,7 @@
; KNL-LABEL: allones_v2i64_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
-; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $3, %al
; KNL-NEXT: sete %al
@@ -3417,7 +3446,7 @@
;
; SKX-LABEL: allones_v2i64_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: cmpb $3, %al
; SKX-NEXT: sete %al
@@ -3449,8 +3478,7 @@
; KNL-LABEL: allzeros_v2i64_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
-; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $3, %al
; KNL-NEXT: sete %al
@@ -3459,7 +3487,7 @@
;
; SKX-LABEL: allzeros_v2i64_and4:
; SKX: # %bb.0:
-; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0
; SKX-NEXT: kortestb %k0, %k0
; SKX-NEXT: sete %al
; SKX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll
--- a/llvm/test/CodeGen/X86/paddus.ll
+++ b/llvm/test/CodeGen/X86/paddus.ll
@@ -54,10 +54,22 @@
; SSE-NEXT: paddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test2:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = add <16 x i8> %x,
%2 = icmp ugt <16 x i8> %x,
%3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1
@@ -70,10 +82,22 @@
; SSE-NEXT: paddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test3:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
+; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
+; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = add <16 x i8> %x,
%2 = icmp ugt <16 x i8> %x,
%3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1
@@ -86,10 +110,22 @@
; SSE-NEXT: paddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test4:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
+; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
+; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = add <16 x i8> %x,
%2 = icmp ugt <16 x i8> %x,
%3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1
@@ -121,7 +157,8 @@
;
; AVX2-LABEL: test5:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmaxub %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
@@ -131,7 +168,8 @@
;
; AVX512-LABEL: test5:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpltub %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
@@ -149,10 +187,22 @@
; SSE-NEXT: paddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test6:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test6:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test6:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test6:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = add <16 x i8> %x,
%2 = icmp ugt <16 x i8> %x,
%3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1
@@ -227,12 +277,14 @@
;
; AVX2-LABEL: test8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = add <32 x i8> %x,
%2 = icmp ugt <32 x i8> %x,
@@ -259,12 +311,14 @@
;
; AVX2-LABEL: test9:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
+; AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test9:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
+; AVX512-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = add <32 x i8> %x,
%2 = icmp ugt <32 x i8> %x,
@@ -291,12 +345,14 @@
;
; AVX2-LABEL: test10:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
+; AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test10:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
+; AVX512-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = add <32 x i8> %x,
%2 = icmp ugt <32 x i8> %x,
@@ -340,7 +396,8 @@
;
; AVX2-LABEL: test11:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpmaxub %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -350,7 +407,8 @@
;
; AVX512-LABEL: test11:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpltub %ymm0, %ymm1, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
@@ -381,12 +439,14 @@
;
; AVX2-LABEL: test12:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test12:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = add <32 x i8> %x,
%2 = icmp ugt <32 x i8> %x,
@@ -487,14 +547,15 @@
;
; AVX2-LABEL: test14:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test14:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512-NEXT: vpaddusb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = add <64 x i8> %x,
%2 = icmp ugt <64 x i8> %x,
@@ -527,14 +588,15 @@
;
; AVX2-LABEL: test15:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test15:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
+; AVX512-NEXT: vpaddusb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = add <64 x i8> %x,
%2 = icmp ugt <64 x i8> %x,
@@ -567,14 +629,15 @@
;
; AVX2-LABEL: test16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
+; AVX512-NEXT: vpaddusb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = add <64 x i8> %x,
%2 = icmp ugt <64 x i8> %x,
@@ -642,7 +705,7 @@
;
; AVX2-LABEL: test17:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -658,7 +721,8 @@
;
; AVX512-LABEL: test17:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
+; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vpcmpltub %zmm0, %zmm1, %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
@@ -695,14 +759,15 @@
;
; AVX2-LABEL: test18:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test18:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512-NEXT: vpaddusb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = add <64 x i8> %x,
%2 = icmp ugt <64 x i8> %x,
@@ -756,10 +821,22 @@
; SSE-NEXT: paddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test20:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test20:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test20:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test20:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = add <8 x i16> %x,
%2 = icmp ugt <8 x i16> %x,
%3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1
@@ -772,10 +849,22 @@
; SSE-NEXT: paddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test21:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test21:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test21:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32769,32769,32769,32769,32769,32769,32769,32769]
+; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test21:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32769,32769,32769,32769,32769,32769,32769,32769]
+; AVX512-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = add <8 x i16> %x,
%2 = icmp ugt <8 x i16> %x,
%3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1
@@ -788,10 +877,22 @@
; SSE-NEXT: paddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test22:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test22:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test22:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65534,65534,65534,65534,65534,65534,65534,65534]
+; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test22:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65534,65534,65534,65534,65534,65534,65534,65534]
+; AVX512-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = add <8 x i16> %x,
%2 = icmp ugt <8 x i16> %x,
%3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1
@@ -842,7 +943,8 @@
;
; AVX2-LABEL: test23:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
@@ -852,7 +954,8 @@
;
; AVX512-LABEL: test23:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vpcmpltuw %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
@@ -870,10 +973,22 @@
; SSE-NEXT: paddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test24:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test24:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test24:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; AVX512-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = add <8 x i16> %x,
%2 = icmp ugt <8 x i16> %x,
%3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1
@@ -948,12 +1063,14 @@
;
; AVX2-LABEL: test26:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test26:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = add <16 x i16> %x,
%2 = icmp ugt <16 x i16> %x,
@@ -980,12 +1097,14 @@
;
; AVX2-LABEL: test27:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769]
+; AVX2-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test27:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769]
+; AVX512-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = add <16 x i16> %x,
%2 = icmp ugt <16 x i16> %x,
@@ -1012,12 +1131,14 @@
;
; AVX2-LABEL: test28:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534]
+; AVX2-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test28:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534]
+; AVX512-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = add <16 x i16> %x,
%2 = icmp ugt <16 x i16> %x,
@@ -1093,7 +1214,8 @@
;
; AVX2-LABEL: test29:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpmaxuw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -1103,7 +1225,8 @@
;
; AVX512-LABEL: test29:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpcmpltuw %ymm0, %ymm1, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
@@ -1134,12 +1257,14 @@
;
; AVX2-LABEL: test30:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test30:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = add <16 x i16> %x,
%2 = icmp ugt <16 x i16> %x,
@@ -1240,14 +1365,15 @@
;
; AVX2-LABEL: test32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} zmm1 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512-NEXT: vpaddusw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = add <32 x i16> %x,
%2 = icmp ugt <32 x i16> %x,
@@ -1280,14 +1406,15 @@
;
; AVX2-LABEL: test33:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769]
; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test33:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} zmm1 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769]
+; AVX512-NEXT: vpaddusw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = add <32 x i16> %x,
%2 = icmp ugt <32 x i16> %x,
@@ -1320,14 +1447,15 @@
;
; AVX2-LABEL: test34:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534]
; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test34:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} zmm1 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534]
+; AVX512-NEXT: vpaddusw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = add <32 x i16> %x,
%2 = icmp ugt <32 x i16> %x,
@@ -1451,7 +1579,7 @@
;
; AVX2-LABEL: test35:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpmaxuw %ymm1, %ymm2, %ymm1
@@ -1467,7 +1595,8 @@
;
; AVX512-LABEL: test35:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
+; AVX512-NEXT: vpbroadcastw {{.*#+}} zmm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vpcmpltuw %zmm0, %zmm1, %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
@@ -1504,14 +1633,15 @@
;
; AVX2-LABEL: test36:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test36:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512-NEXT: vpaddusw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%1 = add <32 x i16> %x,
%2 = icmp ugt <32 x i16> %x,
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -37,8 +37,10 @@
; AVX2-LABEL: mul_v16i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
@@ -47,7 +49,8 @@
; AVX512F-LABEL: mul_v16i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -56,7 +59,8 @@
; AVX512BW-LABEL: mul_v16i8c:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
@@ -74,7 +78,8 @@
;
; AVX-LABEL: mul_v8i16c:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117]
+; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
@@ -122,7 +127,7 @@
;
; AVX-LABEL: mul_v2i64c:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117]
+; AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [117,117]
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
@@ -171,7 +176,8 @@
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
@@ -417,9 +423,9 @@
; AVX2-LABEL: mul_v32i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
@@ -430,9 +436,9 @@
; AVX512F-LABEL: mul_v32i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
@@ -443,7 +449,8 @@
; AVX512BW-LABEL: mul_v32i8c:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -461,7 +468,8 @@
;
; AVX-LABEL: mul_v16i16c:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
%A = mul <16 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
@@ -593,7 +601,7 @@
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -607,7 +615,7 @@
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -799,9 +807,9 @@
; AVX2-LABEL: mul_v64i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
@@ -820,9 +828,9 @@
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
@@ -841,9 +849,9 @@
; AVX512BW-LABEL: mul_v64i8c:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
@@ -955,7 +963,7 @@
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -980,7 +988,7 @@
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
@@ -1004,7 +1012,7 @@
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -376,7 +376,7 @@
; AVX512F-LABEL: and_mulhuw_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/pr34605.ll b/llvm/test/CodeGen/X86/pr34605.ll
--- a/llvm/test/CodeGen/X86/pr34605.ll
+++ b/llvm/test/CodeGen/X86/pr34605.ll
@@ -17,7 +17,7 @@
; CHECK-NEXT: kmovd %ecx, %k1
; CHECK-NEXT: kmovd %k1, %k1
; CHECK-NEXT: kandq %k1, %k0, %k1
-; CHECK-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpbroadcastb {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0 {%k1} {z}
; CHECK-NEXT: vmovdqu64 %zmm0, (%eax)
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqu64 %zmm0, 64(%eax)
diff --git a/llvm/test/CodeGen/X86/pr37499.ll b/llvm/test/CodeGen/X86/pr37499.ll
--- a/llvm/test/CodeGen/X86/pr37499.ll
+++ b/llvm/test/CodeGen/X86/pr37499.ll
@@ -4,7 +4,7 @@
define <2 x i64> @undef_tval() {
; CHECK-LABEL: undef_tval:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpmovqw %zmm0, %xmm0 {%k1}
@@ -18,7 +18,7 @@
define <2 x i64> @foo(<8 x i64> %x) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
@@ -33,7 +33,7 @@
define <4 x i64> @goo(<16 x i32> %x) {
; CHECK-LABEL: goo:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT: movw $1, %ax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/pr43509.ll b/llvm/test/CodeGen/X86/pr43509.ll
--- a/llvm/test/CodeGen/X86/pr43509.ll
+++ b/llvm/test/CodeGen/X86/pr43509.ll
@@ -7,7 +7,7 @@
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %k1
; CHECK-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
+; CHECK-NEXT: vpbroadcastb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
--- a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll
@@ -10,7 +10,8 @@
; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX256-NEXT: vplzcntd %ymm0, %ymm0
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
-; AVX256-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX256-NEXT: vpbroadcastw {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
+; AVX256-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
;
@@ -19,7 +20,8 @@
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -28,7 +30,8 @@
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vplzcntd %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 false)
@@ -41,7 +44,8 @@
; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX256-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX256-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX256-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX256-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
; AVX256-NEXT: vpand %xmm3, %xmm2, %xmm2
@@ -54,7 +58,8 @@
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
;
AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 false) @@ -68,7 +73,7 @@ ; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX256-NEXT: vplzcntd %ymm1, %ymm1 ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 -; AVX256-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; AVX256-NEXT: vpbroadcastw {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; AVX256-NEXT: vpsubw %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX256-NEXT: vplzcntd %ymm0, %ymm0 @@ -82,7 +87,8 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 false) ret <16 x i16> %out @@ -94,7 +100,8 @@ ; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX256-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX256-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX256-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -108,7 +115,7 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll b/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll --- 
a/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll @@ -11,7 +11,8 @@ ; AVX256-NEXT: vpsrlw $8, %ymm0, %ymm1 ; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX256-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX256-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX256-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX256-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -75,7 +76,8 @@ ; AVX256-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX256-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX256-NEXT: vptestmd %ymm1, %ymm1, %k2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovdqa %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll @@ -36,7 +36,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) { ; AVX256-LABEL: testv16i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX256-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX256-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -60,7 +60,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) { ; AVX256-LABEL: testv16i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -86,7 +86,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) { ; CHECK-LABEL: testv32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; CHECK-NEXT: vpshufb %ymm2, %ymm3, %ymm2 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-shift.ll b/llvm/test/CodeGen/X86/prefer-avx256-shift.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-shift.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-shift.ll @@ -9,12 +9,14 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256-LABEL: var_shl_v32i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX256-NEXT: vpsllw $4, %ymm0, %ymm2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX256-NEXT: vpbroadcastb {{.*#+}} ymm3 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX256-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX256-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX256-NEXT: vpbroadcastb {{.*#+}} ymm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX256-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX256-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: vpaddb %ymm0, %ymm0, %ymm2 @@ -32,12 +34,14 @@ ; ; AVX512VL-LABEL: var_shl_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 @@ -113,12 +117,14 @@ ; ; AVX256VL-LABEL: var_shl_v16i8: ; AVX256VL: # %bb.0: -; AVX256VL-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX256VL-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX256VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX256VL-NEXT: vpbroadcastb {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX256VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX256VL-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX256VL-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX256VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX256VL-NEXT: vpbroadcastb {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX256VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX256VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX256VL-NEXT: vpaddb %xmm0, %xmm0, %xmm2 @@ -151,16 +157,19 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256-LABEL: var_lshr_v32i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX256-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX256-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: vpsrlw $2, %ymm0, %ymm2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX256-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX256-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX256-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX256-NEXT: vpblendvb 
%ymm1, %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: vpsrlw $1, %ymm0, %ymm2 -; AVX256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX256-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX256-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX256-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX256-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX256-NEXT: retq @@ -175,16 +184,19 @@ ; ; AVX512VL-LABEL: var_lshr_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -257,16 +269,19 @@ ; ; AVX256VL-LABEL: var_lshr_v16i8: ; AVX256VL: # %bb.0: -; AVX256VL-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX256VL-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX256VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX256VL-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX256VL-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX256VL-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX256VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX256VL-NEXT: vpbroadcastb {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX256VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX256VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX256VL-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX256VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX256VL-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX256VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX256VL-NEXT: vpaddb %xmm1, %xmm1, %xmm1 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX256VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll b/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-trunc.ll @@ -11,7 +11,8 @@ define <16 x i8> @testv16i16_trunc_v16i8(<16 x i16> %x) { ; AVX256NOBW-LABEL: testv16i16_trunc_v16i8: ; AVX256NOBW: # %bb.0: -; AVX256NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256NOBW-NEXT: vpbroadcastw {{.*#+}} ymm1 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX256NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX256NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX256NOBW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -9,7 +9,7 @@ ; AVX256BW: # %bb.0: ; AVX256BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX256BW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX256BW-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -18,24 +18,29 @@ ; AVX256BW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX256BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX256BW-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX256BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX256BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX256BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX256BW-NEXT: vpsrlw $2, %ymm0, %ymm0 -; AVX256BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256BW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX256BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX256BW-NEXT: retq ; ; AVX512BW-LABEL: test_div7_32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb %ymm1, 
%ymm0, %ymm0 ; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq %res = udiv <32 x i8> %a, ret <32 x i8> %res @@ -47,7 +52,7 @@ ; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX256BW-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX256BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX256BW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -17,10 +17,22 @@ ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test1: -; AVX: # %bb.0: # %vector.ph -; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test1: +; AVX1: # %bb.0: # %vector.ph +; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test1: +; AVX2: # %bb.0: # %vector.ph +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test1: +; AVX512: # %bb.0: # %vector.ph +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq vector.ph: %0 = icmp slt <8 x i16> %x, zeroinitializer %1 = xor <8 x i16> %x, @@ -37,10 +49,22 @@ ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ashr_xor_and: -; AVX: # %bb.0: -; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: ashr_xor_and: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ashr_xor_and: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ashr_xor_and: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %signsplat = ashr <8 x i16> %x, %flipsign = xor <8 x i16> %x, %res = and <8 x i16> %signsplat, %flipsign @@ -53,10 +77,22 @@ ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ashr_add_and: -; AVX: # %bb.0: -; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: ashr_add_and: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
ashr_add_and: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ashr_add_and: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %signsplat = ashr <8 x i16> %x, %flipsign = add <8 x i16> %x, %res = and <8 x i16> %signsplat, %flipsign @@ -76,15 +112,37 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ashr_xor_and_commute_uses: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdi) -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: ashr_xor_and_commute_uses: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ashr_xor_and_commute_uses: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ashr_xor_and_commute_uses: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %signsplat = ashr <16 x i8> %x, store <16 x i8> %signsplat, <16 x i8>* %p1 %flipsign = xor <16 x i8> %x, @@ -222,10 +280,22 @@ ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test2: -; AVX: # %bb.0: # %vector.ph -; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test2: +; AVX1: # %bb.0: # %vector.ph +; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test2: +; AVX2: # %bb.0: # %vector.ph +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test2: +; AVX512: # %bb.0: # %vector.ph +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq vector.ph: %0 = icmp ugt <8 x i16> %x, %1 = add <8 x i16> %x, @@ -277,10 +347,22 @@ ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test4: -; AVX: # %bb.0: # %vector.ph -; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test4: +; AVX1: # %bb.0: # %vector.ph +; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test4: +; AVX2: # %bb.0: # %vector.ph +; AVX2-NEXT: vpbroadcastb 
{{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test4: +; AVX512: # %bb.0: # %vector.ph +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq vector.ph: %0 = icmp slt <16 x i8> %x, zeroinitializer %1 = xor <16 x i8> %x, @@ -294,10 +376,22 @@ ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test5: -; AVX: # %bb.0: # %vector.ph -; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test5: +; AVX1: # %bb.0: # %vector.ph +; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test5: +; AVX2: # %bb.0: # %vector.ph +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test5: +; AVX512: # %bb.0: # %vector.ph +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq vector.ph: %0 = icmp ugt <16 x i8> %x, %1 = add <16 x i8> %x, @@ -379,12 +473,14 @@ ; ; AVX2-LABEL: test7: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test7: ; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq vector.ph: %0 = icmp slt <16 x i16> %x, zeroinitializer @@ -412,12 +508,14 @@ ; ; AVX2-LABEL: ashr_xor_and_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: ashr_xor_and_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %signsplat = ashr <16 x i16> %x, %flipsign = xor <16 x i16> %x, @@ -444,12 +542,14 @@ ; ; AVX2-LABEL: ashr_add_and_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: ashr_add_and_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %signsplat = ashr <16 x i16> 
%x, %flipsign = add <16 x i16> %x, @@ -476,12 +576,14 @@ ; ; AVX2-LABEL: test8: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test8: ; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq vector.ph: %0 = icmp ugt <16 x i16> %x, @@ -582,12 +684,14 @@ ; ; AVX2-LABEL: test10: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test10: ; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq vector.ph: %0 = icmp slt <32 x i8> %x, zeroinitializer @@ -615,12 +719,14 @@ ; ; AVX2-LABEL: test11: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test11: ; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq vector.ph: %0 = icmp ugt <32 x i8> %x, @@ -2220,10 +2326,22 @@ ; SSE-NEXT: psubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test19: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test19: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test19: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test19: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] +; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %0 = icmp ugt <16 x i8> %x, %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> @@ -2254,10 +2372,22 @@ ; SSE-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test21: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test21: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpsubusw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test21: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [700,700,700,700,700,700,700,700] +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test21: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [700,700,700,700,700,700,700,700] +; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %0 = icmp ugt <8 x i16> %x, %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> @@ -2301,12 +2431,14 @@ ; ; AVX2-LABEL: test23: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] +; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test23: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] +; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %0 = icmp ugt <32 x i8> %x, @@ -2365,12 +2497,14 @@ ; ; AVX2-LABEL: test25: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000] +; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test25: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000] +; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %0 = icmp ugt <16 x i16> %x, @@ -2435,14 +2569,15 @@ ; ; AVX2-LABEL: test27: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test27: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] +; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq entry: %0 = icmp ugt <64 x i8> %x, @@ -2559,11 +2694,25 @@ ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; -; AVX-LABEL: test31: -; AVX: # %bb.0: -; AVX-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: test31: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71] +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; 
AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test31:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [71,71,71,71,71,71,71,71,71,71,71,71,71,71,71,71]
+; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: retq
%t0 = bitcast <2 x i64> %x to <16 x i8>
%cmp = icmp ugt <16 x i8> %t0,
%bop = add <16 x i8> %t0,
diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
--- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
@@ -130,21 +130,14 @@
}
define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
-; X86-LABEL: illegal_no_extract_mul:
-; X86: # %bb.0:
-; X86-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
-; X86-NEXT: vpsrlw $10, %zmm0, %zmm1
-; X86-NEXT: vpsllw $6, %zmm0, %zmm0
-; X86-NEXT: vporq %zmm1, %zmm0, %zmm0
-; X86-NEXT: retl
-;
-; X64-LABEL: illegal_no_extract_mul:
-; X64: # %bb.0:
-; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; X64-NEXT: vpsrlw $10, %zmm0, %zmm1
-; X64-NEXT: vpsllw $6, %zmm0, %zmm0
-; X64-NEXT: vporq %zmm1, %zmm0, %zmm0
-; X64-NEXT: retq
+; CHECK-LABEL: illegal_no_extract_mul:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpsrlw $10, %zmm0, %zmm1
+; CHECK-NEXT: vpsllw $6, %zmm0, %zmm0
+; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ret{{[l|q]}}
%lhs_mul = mul <32 x i16> %i,
%rhs_mul = mul <32 x i16> %i,
%rhs_shift = lshr <32 x i16> %rhs_mul,
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -530,14 +530,15 @@
; AVX2-LABEL: v16i4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -545,14 +546,15 @@
; AVX512F-LABEL: v16i4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
@@ -560,15 +562,16 @@
; AVX512BW-LABEL: v16i4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpternlogq $108, %xmm0, %xmm2, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm0
; AVX512BW-NEXT: retq
%z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z
@@ -1207,8 +1210,11 @@
; AVX2-LABEL: v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807]
+; AVX2-NEXT: # xmm3 = mem[0,0]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: # xmm4 = mem[0,0]
+; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
@@ -1217,8 +1223,11 @@
; AVX512F-LABEL: v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX512F-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807]
+; AVX512F-NEXT: # xmm3 = mem[0,0]
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: # xmm4 = mem[0,0]
+; AVX512F-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm3
; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
@@ -1232,8 +1241,8 @@
; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k1
; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
-; AVX512BW-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; AVX512BW-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
; AVX512BW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -368,8 +368,10 @@
;
; AVX-LABEL: unsigned_sat_constant_v16i8_using_min:
; AVX: # %bb.0:
-; AVX-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [213,213,213,213,213,213,213,213,213,213,213,213,213,213,213,213]
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%c = icmp ult <16 x i8> %x,
%s = select <16 x i1> %c, <16 x i8> %x, <16 x i8>
@@ -385,7 +387,8 @@
;
; AVX-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum:
; AVX: # %bb.0:
-; AVX-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = add <16 x i8> %x,
%c = icmp ugt <16 x i8> %x, %a
@@ -401,7 +404,8 @@
;
; AVX-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval:
; AVX: # %bb.0:
-; AVX-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = add <16 x i8> %x,
%c = icmp ugt <16 x i8> %x,
@@ -426,8 +430,10 @@
;
; AVX-LABEL: unsigned_sat_constant_v8i16_using_min:
; AVX: # %bb.0:
-; AVX-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65493,65493,65493,65493,65493,65493,65493,65493]
+; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%c = icmp ult <8 x i16> %x,
%s = select <8 x i1> %c, <8 x i16> %x, <8 x i16>
@@ -443,7 +449,8 @@
;
; AVX-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum:
; AVX: # %bb.0:
-; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42]
+; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = add <8 x i16> %x,
%c = icmp ugt <8 x i16> %x, %a
@@ -459,7 +466,8 @@
;
; AVX-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval:
; AVX: # %bb.0:
-; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [42,42,42,42,42,42,42,42]
+; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = add <8 x i16> %x,
%c = icmp ugt <8 x i16> %x,
@@ -657,18 +665,21 @@
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_min:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573]
+; AVX2-NEXT: # xmm1 = mem[0,0]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765]
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42]
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: unsigned_sat_constant_v2i64_using_min:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpminuq
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512-NEXT: retq %c = icmp ult <2 x i64> %x, %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> @@ -726,18 +737,19 @@ ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; AVX512: # %bb.0: -; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512-NEXT: retq %a = add <2 x i64> %x, %c = icmp ugt <2 x i64> %x, %a @@ -793,18 +805,19 @@ ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; AVX512: # %bb.0: -; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512-NEXT: retq %a = add <2 x i64> %x, %c = icmp ugt <2 x i64> %x, @@ -1206,8 +1219,10 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775807,9223372036854775807] +; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -1275,7 +1290,7 @@ ; ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = 
[9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 @@ -1344,8 +1359,10 @@ ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_notval: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807] +; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -18,18 +18,27 @@ ; Ideally, the shuffles should be lowered to code with the same quality as the truncates. define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpand 16(%rdi), %xmm0, %xmm1 -; AVX-NEXT: vpand (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpand (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand 16(%rdi), %xmm0, %xmm1 +; AVX2-NEXT: vpand (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand 16(%rdi), %xmm0, %xmm1 ; AVX512F-NEXT: vpand (%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -38,7 +47,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand 16(%rdi), %xmm0, %xmm1 ; AVX512VL-NEXT: vpand (%rdi), %xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -261,7 +261,7 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <18778,u,18778,u,18778,u,18778,u,18778,u,18778,u,18778,u,18778,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
 ; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
 ; AVX2-SLOW-NEXT: ret{{[l|q]}}
@@ -271,7 +271,7 @@
 ; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-32-NEXT: vmovdqa {{.*#+}} ymm2 = <18778,u,18778,u,18778,u,18778,u,18778,u,18778,u,18778,u,18778,u>
+; AVX2-32-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
 ; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
 ; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
 ; AVX2-32-NEXT: retl
@@ -281,7 +281,7 @@
 ; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = <18778,u,18778,u,18778,u,18778,u,18778,u,18778,u,18778,u,18778,u>
+; AVX2-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
 ; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
 ; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT: retq
@@ -359,7 +359,7 @@
 ;
 ; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
 ; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
 ; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -419,7 +419,7 @@
 ;
 ; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
 ; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
 ; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
@@ -502,7 +502,7 @@
 ;
 ; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
 ; AVX2-SLOW-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
 ; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm1
 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -851,13 +851,23 @@
 ; SSE-NEXT: divsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: div_sqrt_fabs_f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vsqrtsd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: div_sqrt_fabs_f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vsqrtsd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmulsd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: div_sqrt_fabs_f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovddup {{.*#+}} xmm3 = [NaN,NaN]
+; AVX512-NEXT: # xmm3 = mem[0,0]
+; AVX512-NEXT: vsqrtsd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vandpd %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vmulsd %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %s = call fast double @llvm.sqrt.f64(double %z)
 %a = call fast double @llvm.fabs.f64(double %y)
 %m = fmul fast double %s, %a
@@ -1048,13 +1058,22 @@
 ; SSE-NEXT: movupd %xmm1, (%rdi)
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: sqrt_simplify_before_recip_vec:
-; AVX: # %bb.0:
-; AVX-NEXT: vsqrtpd %xmm0, %xmm0
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
-; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vmovupd %xmm1, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: sqrt_simplify_before_recip_vec:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vsqrtpd %xmm0, %xmm0
+; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; AVX1-NEXT: vdivpd %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vmovupd %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: sqrt_simplify_before_recip_vec:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vsqrtpd %xmm0, %xmm0
+; AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; AVX512-NEXT: # xmm1 = mem[0,0]
+; AVX512-NEXT: vdivpd %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vmovupd %xmm1, (%rdi)
+; AVX512-NEXT: retq
 %sqrt = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
 %rsqrt = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt
 %sqrt_fast = fdiv fast <2 x double> %x, %sqrt
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2506,7 +2506,7 @@
 ; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
 ; CHECK-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
 ; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[8],zero,ymm0[9],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[13],zero,zero,zero,ymm0[15],zero,zero,zero,ymm0[25],zero,zero,zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,zero,zero,ymm0[31],zero
 ; CHECK-AVX2-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
@@ -2521,7 +2521,8 @@
 ; CHECK-AVX2-NEXT: vpsrlw $8, %ymm6, %ymm6
 ; CHECK-AVX2-NEXT: vpackuswb %ymm4, %ymm6, %ymm4
 ; CHECK-AVX2-NEXT: vpsrlw $7, %ymm3, %ymm3
-; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; CHECK-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-AVX2-NEXT: vpand %ymm6, %ymm3, %ymm3
 ; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
 ; CHECK-AVX2-NEXT: vpaddb %ymm3, %ymm4, %ymm3
 ; CHECK-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -2543,7 +2544,7 @@
 ; CHECK-AVX512VL: # %bb.0:
 ; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
@@ -2561,7 +2562,8 @@
 ; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
 ; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; CHECK-AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
 ; CHECK-AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; CHECK-AVX512VL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
 ; CHECK-AVX512VL-NEXT: vpandn %ymm0, %ymm3, %ymm3
diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
--- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
@@ -122,16 +122,29 @@
 ; SSE-NEXT: psubw %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: fold_srem_vec_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
-; AVX-NEXT: vpsraw $6, %xmm1, %xmm1
-; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: fold_srem_vec_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm2
+; AVX1-NEXT: vpsraw $6, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: fold_srem_vec_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm2
+; AVX2-NEXT: vpsraw $6, %xmm1, %xmm1
+; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
+; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
 %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
 ret <4 x i16> %1
 }
@@ -154,17 +167,31 @@
 ; SSE-NEXT: paddw %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_srem_sdiv:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
-; AVX-NEXT: vpsraw $6, %xmm1, %xmm1
-; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
-; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_srem_sdiv:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm2
+; AVX1-NEXT: vpsraw $6, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_srem_sdiv:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [95,95,95,95,95,95,95,95]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; AVX2-NEXT: vpmulhw %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpaddw %xmm0, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw $15, %xmm2, %xmm3
+; AVX2-NEXT: vpsraw $6, %xmm2, %xmm2
+; AVX2-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpmullw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
 %2 = sdiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
 %3 = add <4 x i16> %1, %2
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -530,14 +530,15 @@
 ; AVX2-LABEL: v16i4:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
@@ -545,14 +546,15 @@
 ; AVX512F-LABEL: v16i4:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: retq
@@ -560,15 +562,16 @@
 ; AVX512BW-LABEL: v16i4:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpternlogq $108, %xmm0, %xmm2, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm0
 ; AVX512BW-NEXT: retq
 %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
 ret <16 x i4> %z
@@ -589,19 +592,22 @@
 ;
 ; AVX2-LABEL: v16i1:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: v16i1:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: v16i1:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpternlogq $96, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-NEXT: vpternlogq $96, %xmm2, %xmm1, %xmm0
 ; AVX512BW-NEXT: retq
 %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
 ret <16 x i1> %z
@@ -1304,8 +1310,11 @@
 ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775807,9223372036854775807]
+; AVX2-NEXT: # xmm2 = mem[0,0]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: # xmm3 = mem[0,0]
+; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -1316,8 +1325,11 @@
 ; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1
 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775807,9223372036854775807]
+; AVX512F-NEXT: # xmm2 = mem[0,0]
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: # xmm3 = mem[0,0]
+; AVX512F-NEXT: vblendvpd %xmm1, %xmm2, %xmm3, %xmm2
 ; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
 ; AVX512F-NEXT: retq
 ;
@@ -1329,8 +1341,8 @@
 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
 ; AVX512BW-NEXT: kxorw %k1, %k0, %k1
 ; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
-; AVX512BW-NEXT: vmovdqa64 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; AVX512BW-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k2}
 ; AVX512BW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
 ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -490,14 +490,32 @@
 ; SSE-NEXT: pminub %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: v16i4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: v16i4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16i4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v16i4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
 ret <16 x i4> %z
 }
@@ -906,7 +924,7 @@
 ;
 ; AVX2-LABEL: v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll
@@ -173,7 +173,8 @@
 ;
 ; CHECK-AVX2-LABEL: t2_narrow:
 ; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [43691,43691,43691,43691,43691,43691,43691,43691]
+; CHECK-AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -182,7 +183,8 @@
 ;
 ; CHECK-AVX512VL-LABEL: t2_narrow:
 ; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [43691,43691,43691,43691,43691,43691,43691,43691]
+; CHECK-AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; CHECK-AVX512VL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; CHECK-AVX512VL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -240,15 +242,17 @@
 ;
 ; CHECK-AVX2-LABEL: t3_wide:
 ; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2863311530,2863311530]
+; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12297829382473034411,12297829382473034411]
+; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -257,11 +261,11 @@
 ;
 ; CHECK-AVX512VL-LABEL: t3_wide:
 ; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
+; CHECK-AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411]
 ; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
 ; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm3
 ; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
 ; CHECK-AVX512VL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
 ; CHECK-AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
 ; CHECK-AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -96,13 +96,23 @@
 ; SSE-NEXT: psubw %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: fold_urem_vec_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: fold_urem_vec_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vpsrlw $6, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: fold_urem_vec_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpsrlw $6, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
+; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
 %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
 ret <4 x i16> %1
 }
@@ -121,14 +131,25 @@
 ; SSE-NEXT: paddw %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_urem_udiv:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
-; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_urem_udiv:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vpsrlw $6, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_urem_udiv:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [95,95,95,95,95,95,95,95]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; AVX2-NEXT: vpmulhuw %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpsrlw $6, %xmm2, %xmm2
+; AVX2-NEXT: vpmullw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
 %2 = udiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
 %3 = add <4 x i16> %1, %2
diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -489,13 +489,29 @@
 ; SSE-NEXT: psubusb %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: v16i4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: v16i4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16i4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v16i4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
 %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
 ret <16 x i4> %z
 }
@@ -515,19 +531,22 @@
 ;
 ; AVX2-LABEL: v16i1:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: v16i1:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: v16i1:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpternlogq $96, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512BW-NEXT: vpternlogq $96, %xmm2, %xmm1, %xmm0
 ; AVX512BW-NEXT: retq
 %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
 ret <16 x i1> %z
@@ -817,7 +836,7 @@
 ;
 ; AVX2-LABEL: v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -1112,7 +1131,7 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
 ; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
@@ -1124,7 +1143,7 @@
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
 ; AVX512F-NEXT: vpsubusw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdi)
@@ -1135,7 +1154,8 @@
 ; AVX512BW-LABEL: PR48223:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -2,13 +2,13 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI
 define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
@@ -185,12 +185,37 @@
 ; SSE41-NEXT: pshufb %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVXNOVLBW-LABEL: var_shuffle_v8i16:
-; AVXNOVLBW: # %bb.0:
-; AVXNOVLBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVXNOVLBW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVXNOVLBW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVXNOVLBW-NEXT: retq
+; XOP-LABEL: var_shuffle_v8i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; XOP-NEXT: retq
+;
+; AVX1-LABEL: var_shuffle_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
+; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: var_shuffle_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
+; AVX512-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
+; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: var_shuffle_v8i16:
 ; AVX512VL: # %bb.0:
@@ -1014,36 +1039,26 @@
 ;
 ; AVX2-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm3
 ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm2
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
 ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
-; AVX512VLBW-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VLBW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
-; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512VLBW-NEXT: vzeroupper
-; AVX512VLBW-NEXT: retq
-;
 ; VLVBMI-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
 ; VLVBMI: # %bb.0:
 ; VLVBMI-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
@@ -1051,6 +1066,18 @@
 ; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; VLVBMI-NEXT: vzeroupper
 ; VLVBMI-NEXT: retq
+; AVX512VLBW-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpcmpgtb %ymm2, %ymm1, %k1
+; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VLBW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
+; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
 %index0 = extractelement <16 x i8> %indices, i32 0
 %index1 = extractelement <16 x i8> %indices, i32 1
 %index2 = extractelement <16 x i8> %indices, i32 2
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -168,38 +168,47 @@
 ;
 ; AVX2-LABEL: var_shuffle_v16i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
 ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: var_shuffle_v16i16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm2 = [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX512-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
 ; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm3
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VLDQ-LABEL: var_shuffle_v16i16:
 ; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
+; AVX512VLDQ-NEXT: vpbroadcastw {{.*#+}} ymm2 = [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX512VLDQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX512VLDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512VLDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLDQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm3
+; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512VLDQ-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX512VLDQ-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: var_shuffle_v16i16:
@@ -290,40 +299,44 @@
 ;
 ; AVX2-LABEL: var_shuffle_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm3
 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: var_shuffle_v32i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3
 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VLDQ-LABEL: var_shuffle_v32i8:
 ; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
-; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512VLDQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLDQ-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm2
+; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3
 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512VLDQ-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
 ; AVX512VLDQ-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: var_shuffle_v32i8:
 ; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpcmpgtb %ymm2, %ymm1, %k1
 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
 ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1}
 ; AVX512VLBW-NEXT: vmovdqa %ymm2, %ymm0
 ; AVX512VLBW-NEXT: retq
@@ -713,37 +726,46 @@
 ; AVX2-LABEL: var_shuffle_v16i16_from_v8i16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: var_shuffle_v16i16_from_v8i16:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm2 = [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX512-NEXT: vpaddw %ymm2, %ymm1, %ymm1
 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm2
+; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VLDQ-LABEL: var_shuffle_v16i16_from_v8i16:
 ; AVX512VLDQ: # %bb.0:
 ; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
+; AVX512VLDQ-NEXT: vpbroadcastw {{.*#+}} ymm2 = [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514]
+; AVX512VLDQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX512VLDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1
 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512VLDQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLDQ-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm2
+; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm1
+; AVX512VLDQ-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX512VLDQ-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16:
@@ -835,39 +857,43 @@
 ; AVX2-LABEL: var_shuffle_v32i8_from_v16i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm3
 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: var_shuffle_v32i8_from_v16i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm2
+; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm3
 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VLDQ-LABEL: var_shuffle_v32i8_from_v16i8:
 ; AVX512VLDQ: # %bb.0:
 ; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2
+; AVX512VLDQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLDQ-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm2
+; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3
 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512VLDQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512VLDQ-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
 ; AVX512VLDQ-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8:
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpcmpgtb %ymm2, %ymm1, %k1
 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
 ; AVX512VLBW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 {%k1}
 ; AVX512VLBW-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vec-copysign-avx512.ll b/llvm/test/CodeGen/X86/vec-copysign-avx512.ll
--- a/llvm/test/CodeGen/X86/vec-copysign-avx512.ll
+++ b/llvm/test/CodeGen/X86/vec-copysign-avx512.ll
@@ -32,7 +32,7 @@
 define <2 x double> @v2f64(<2 x double> %a, <2 x double> %b) nounwind {
 ; CHECK-LABEL: v2f64:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; CHECK-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %tmp = tail call <2 x double> @llvm.copysign.v2f64( <2 x double> %a, <2 x double> %b )
 ret <2 x double> %tmp
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128-fp16.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
 declare <8 x half> @llvm.experimental.constrained.sitofp.v8f16.v8i1(<8 x i1>, metadata, metadata)
 declare <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i1(<8 x i1>, metadata, metadata)
@@ -71,17 +71,12 @@
 }
 define <8 x half> @uitofp_v8i1_v8f16(<8 x i1> %x) #0 {
-; X86-LABEL: uitofp_v8i1_v8f16:
-; X86: # %bb.0:
-; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-NEXT: vcvtuw2ph %xmm0, %xmm0
-; X86-NEXT: retl
-;
-; X64-LABEL: uitofp_v8i1_v8f16:
-; X64: # %bb.0:
-; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vcvtuw2ph %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: uitofp_v8i1_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
 %result = call <8 x half> @llvm.experimental.constrained.uitofp.v8f16.v8i1(<8 x i1> %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256-fp16.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK
 declare <16 x half> @llvm.experimental.constrained.sitofp.v16f16.v16i1(<16 x i1>, metadata, metadata)
 declare <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i1(<16 x i1>, metadata, metadata)
@@ -28,19 +28,13 @@
 }
 define <16 x half> @uitofp_v16i1_v16f16(<16 x i1> %x) #0 {
-; X86-LABEL: uitofp_v16i1_v16f16:
-; X86: # %bb.0:
-; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X86-NEXT: vcvtuw2ph %ymm0, %ymm0
-; X86-NEXT: retl
-;
-; X64-LABEL: uitofp_v16i1_v16f16:
-; X64: # %bb.0:
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X64-NEXT: vcvtuw2ph %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: uitofp_v16i1_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vcvtuw2ph %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
 %result = call <16 x half> @llvm.experimental.constrained.uitofp.v16f16.v16i1(<16 x i1> %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX1,AVX-32,AVX1-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX1,AVX1-64
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 -O3 | FileCheck %s --check-prefixes=CHECK,AVX2,AVX-32,AVX2-32
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 -O3 | FileCheck %s --check-prefixes=CHECK,AVX2,AVX-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -O3 | FileCheck %s --check-prefixes=CHECK,AVX2,AVX2-64
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,AVX512F,AVX-32,AVX512F-32
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,AVX512F,AVX-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,AVX512F,AVX512F-64
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX-32,AVX512VL-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-64
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,AVX512DQ
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512dq,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512DQVL,AVX512DQVL-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512DQVL,AVX512DQVL-64
@@ -112,75 +112,45 @@
 ; AVX1-64-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; AVX1-64-NEXT: retq
 ;
-; AVX2-32-LABEL: uitofp_v8i1_v8f32:
-; AVX2-32: # %bb.0:
-; AVX2-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-32-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-32-NEXT: retl
+; AVX2-LABEL: uitofp_v8i1_v8f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: ret{{[l|q]}}
 ;
-; AVX2-64-LABEL: uitofp_v8i1_v8f32:
-; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-64-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-64-NEXT: retq
+; AVX512F-LABEL: uitofp_v8i1_v8f32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX512F-NEXT: ret{{[l|q]}}
 ;
-; AVX512F-32-LABEL: uitofp_v8i1_v8f32:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-32-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512F-32-NEXT: retl
+; AVX512VL-LABEL: uitofp_v8i1_v8f32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VL-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX512VL-NEXT: ret{{[l|q]}}
 ;
-; AVX512F-64-LABEL: uitofp_v8i1_v8f32:
-; AVX512F-64: # %bb.0:
-; AVX512F-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-64-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512F-64-NEXT: retq
+; AVX512DQ-LABEL: uitofp_v8i1_v8f32:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX512DQ-NEXT: ret{{[l|q]}}
 ;
-; AVX512VL-32-LABEL: uitofp_v8i1_v8f32:
-; AVX512VL-32: # %bb.0:
-; AVX512VL-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; AVX512VL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512VL-32-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512VL-32-NEXT: retl
-;
-; AVX512VL-64-LABEL: uitofp_v8i1_v8f32:
-; AVX512VL-64: # %bb.0:
-; AVX512VL-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512VL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512VL-64-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512VL-64-NEXT: retq
-;
-; AVX512DQ-32-LABEL: uitofp_v8i1_v8f32:
-; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-32-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512DQ-32-NEXT: retl
-;
-; AVX512DQ-64-LABEL: uitofp_v8i1_v8f32:
-; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-64-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512DQ-64-NEXT: retq
-;
-; AVX512DQVL-32-LABEL: uitofp_v8i1_v8f32:
-; AVX512DQVL-32: # %bb.0:
-; AVX512DQVL-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; AVX512DQVL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-32-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512DQVL-32-NEXT: retl
-;
-; AVX512DQVL-64-LABEL: uitofp_v8i1_v8f32:
-; AVX512DQVL-64: # %bb.0:
-; AVX512DQVL-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512DQVL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-64-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512DQVL-64-NEXT: retq
+; AVX512DQVL-LABEL: uitofp_v8i1_v8f32:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQVL-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX512DQVL-NEXT: ret{{[l|q]}}
 %result = call <8 x float> @llvm.experimental.constrained.uitofp.v8f32.v8i1(<8 x i1> %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512-fp16.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 -O3 | FileCheck %s --check-prefixes=CHECK
 declare <32 x half> @llvm.experimental.constrained.sitofp.v32f16.v32i1(<32 x i1>, metadata, metadata)
 declare <32 x half> @llvm.experimental.constrained.uitofp.v32f16.v32i1(<32 x i1>, metadata, metadata)
@@ -28,19 +28,13 @@
 }
 define <32 x half> @uitofp_v32i1_v32f16(<32 x i1> %x) #0 {
-; X86-LABEL: uitofp_v32i1_v32f16:
-; X86: # %bb.0:
-; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; X86-NEXT: vcvtuw2ph %zmm0, %zmm0
-; X86-NEXT: retl
-;
-; X64-LABEL: uitofp_v32i1_v32f16:
-; X64: # %bb.0:
-; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; X64-NEXT: vcvtuw2ph %zmm0, %zmm0
-; X64-NEXT: retq
+; CHECK-LABEL: uitofp_v32i1_v32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; CHECK-NEXT: vcvtuw2ph %zmm0, %zmm0
+; CHECK-NEXT: ret{{[l|q]}}
 %result = call <32 x half> @llvm.experimental.constrained.uitofp.v32f16.v32i1(<32 x i1> %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,NODQ-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,NODQ-64
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512f,avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,DQ,DQ-32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,DQ,DQ-64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512f,avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,DQ
 declare <16 x float> @llvm.experimental.constrained.sitofp.v16f32.v16i1(<16 x i1>, metadata, metadata)
 declare <16 x float> @llvm.experimental.constrained.uitofp.v16f32.v16i1(<16 x i1>, metadata, metadata)
@@ -40,33 +40,13 @@
 }
 define <16 x float> @uitofp_v16i1_v16f32(<16 x i1> %x) #0 {
-; NODQ-32-LABEL: uitofp_v16i1_v16f32:
-; NODQ-32: # %bb.0:
-; NODQ-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; NODQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; NODQ-32-NEXT: vcvtdq2ps %zmm0, %zmm0
-; NODQ-32-NEXT: retl
-;
-; NODQ-64-LABEL: uitofp_v16i1_v16f32:
-; NODQ-64: # %bb.0:
-; NODQ-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; NODQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; NODQ-64-NEXT: vcvtdq2ps %zmm0, %zmm0
-; NODQ-64-NEXT: retq
-;
-; DQ-32-LABEL: uitofp_v16i1_v16f32:
-; DQ-32: # %bb.0:
-; DQ-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; DQ-32-NEXT: vcvtdq2ps %zmm0, %zmm0
-; DQ-32-NEXT: retl
-;
-; DQ-64-LABEL: uitofp_v16i1_v16f32:
-; DQ-64: # %bb.0:
-; DQ-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; DQ-64-NEXT: vcvtdq2ps %zmm0, %zmm0
-; DQ-64-NEXT: retq
+; CHECK-LABEL: uitofp_v16i1_v16f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT: ret{{[l|q]}}
 %result = call <16 x float> @llvm.experimental.constrained.uitofp.v16f32.v16i1(<16 x i1> %x, metadata
!"round.dynamic", metadata !"fpexcept.strict") #0 @@ -158,33 +138,13 @@ } define <8 x double> @uitofp_v8i1_v8f64(<8 x i1> %x) #0 { -; NODQ-32-LABEL: uitofp_v8i1_v8f64: -; NODQ-32: # %bb.0: -; NODQ-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; NODQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; NODQ-32-NEXT: vcvtdq2pd %ymm0, %zmm0 -; NODQ-32-NEXT: retl -; -; NODQ-64-LABEL: uitofp_v8i1_v8f64: -; NODQ-64: # %bb.0: -; NODQ-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; NODQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; NODQ-64-NEXT: vcvtdq2pd %ymm0, %zmm0 -; NODQ-64-NEXT: retq -; -; DQ-32-LABEL: uitofp_v8i1_v8f64: -; DQ-32: # %bb.0: -; DQ-32-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; DQ-32-NEXT: vcvtdq2pd %ymm0, %zmm0 -; DQ-32-NEXT: retl -; -; DQ-64-LABEL: uitofp_v8i1_v8f64: -; DQ-64: # %bb.0: -; DQ-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; DQ-64-NEXT: vcvtdq2pd %ymm0, %zmm0 -; DQ-64-NEXT: retq +; CHECK-LABEL: uitofp_v8i1_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} %result = call <8 x double> @llvm.experimental.constrained.uitofp.v8f64.v8i1(<8 x i1> %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 diff --git a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll --- a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll +++ b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll @@ -342,7 +342,7 @@ ; ; AVX2-LABEL: ge_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 @@ -516,7 +516,7 @@ ; ; AVX2-LABEL: gt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -764,7 +764,7 @@ ; ; AVX2-LABEL: le_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -939,7 +939,7 @@ ; ; AVX2-LABEL: lt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 diff 
--git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll --- a/llvm/test/CodeGen/X86/vec_fabs.ll +++ b/llvm/test/CodeGen/X86/vec_fabs.ll @@ -7,15 +7,35 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512VLDQ define <2 x double> @fabs_v2f64(<2 x double> %p) { -; X86-LABEL: fabs_v2f64: -; X86: # %bb.0: -; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-NEXT: retl +; X86-AVX-LABEL: fabs_v2f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; -; X64-LABEL: fabs_v2f64: -; X64: # %bb.0: -; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-NEXT: retq +; X86-AVX512VL-LABEL: fabs_v2f64: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0 +; X86-AVX512VL-NEXT: retl +; +; X86-AVX512VLDQ-LABEL: fabs_v2f64: +; X86-AVX512VLDQ: # %bb.0: +; X86-AVX512VLDQ-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0 +; X86-AVX512VLDQ-NEXT: retl +; +; X64-AVX-LABEL: fabs_v2f64: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq +; +; X64-AVX512VL-LABEL: fabs_v2f64: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; X64-AVX512VL-NEXT: retq +; +; X64-AVX512VLDQ-LABEL: fabs_v2f64: +; X64-AVX512VLDQ: # %bb.0: +; X64-AVX512VLDQ-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; X64-AVX512VLDQ-NEXT: retq %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p) ret <2 x double> %t } diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -344,15 +344,27 @@ ; SSE-NEXT: orpd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f64_to_4i32: -; VEX: # %bb.0: -; VEX-NEXT: vcvttpd2dq %xmm0, %xmm1 -; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 -; VEX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; VEX-NEXT: vandpd %xmm2, %xmm0, %xmm0 -; VEX-NEXT: vorpd %xmm0, %xmm1, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: fptoui_2f64_to_4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vcvttpd2dq %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vorpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [-2.147483648E+9,-2.147483648E+9] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vcvttpd2dq %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vandpd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vorpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_4i32: ; AVX512F: # %bb.0: @@ -396,15 +408,27 @@ ; SSE-NEXT: orpd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f64_to_2i32: -; VEX: # %bb.0: -; VEX-NEXT: vcvttpd2dq %xmm0, %xmm1 -; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 -; VEX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; VEX-NEXT: vandpd %xmm2, %xmm0, %xmm0 -; VEX-NEXT: vorpd %xmm0, %xmm1, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: fptoui_2f64_to_2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vcvttpd2dq %xmm0, %xmm1 +; AVX1-NEXT: 
vpsrad $31, %xmm1, %xmm2 +; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vorpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [-2.147483648E+9,-2.147483648E+9] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vcvttpd2dq %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vandpd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vorpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_2i32: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -52,14 +52,23 @@ ; SSE41-NEXT: cvtpd2ps %xmm0, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_2i32_to_2f32: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vcvtpd2ps %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_2i32_to_2f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i32_to_2f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i32_to_2f32: ; AVX512F: # %bb.0: @@ -604,10 +613,14 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: # xmm2 = mem[0,0] +; AVX2-NEXT: vsubpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -615,10 +628,14 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpor %xmm2, %xmm0, 
%xmm0 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: # xmm2 = mem[0,0] +; AVX512F-NEXT: vsubpd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; @@ -626,10 +643,10 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512VL-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; @@ -667,13 +684,21 @@ ; SSE41-NEXT: subpd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_2i32_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_2i32_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i32_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i32_to_2f64: ; AVX512F: # %bb.0: @@ -1908,23 +1933,42 @@ ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_2i64_to_4f32: -; VEX: # %bb.0: -; VEX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; VEX-NEXT: vpsrlq $1, %xmm0, %xmm2 -; VEX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; VEX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 -; VEX-NEXT: vpextrq $1, %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: vmovq %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero -; VEX-NEXT: vaddps %xmm1, %xmm1, %xmm2 -; VEX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; VEX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; VEX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_2i64_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero +; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i64_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero +; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i64_to_4f32: ; AVX512F: # %bb.0: @@ -2023,24 +2067,44 @@ ; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm3[0],zero ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_2i64_to_2f32: -; VEX: # %bb.0: -; VEX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; VEX-NEXT: vpsrlq $1, %xmm0, %xmm2 -; VEX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; VEX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 -; VEX-NEXT: vpextrq $1, %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: vmovq %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero -; VEX-NEXT: vaddps %xmm1, %xmm1, %xmm2 -; VEX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; VEX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; VEX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_2i64_to_2f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero +; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i64_to_2f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero +; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i64_to_2f32: ; AVX512F: # %bb.0: @@ -3276,10 +3340,14 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: 
vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: # xmm2 = mem[0,0] +; AVX2-NEXT: vsubpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3288,10 +3356,14 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: # xmm2 = mem[0,0] +; AVX512F-NEXT: vsubpd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; @@ -3300,10 +3372,10 @@ ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VL-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512VL-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; @@ -3343,13 +3415,21 @@ ; SSE41-NEXT: subpd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_load_2i32_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_load_2i32_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_2i32_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_2i32_to_2f64: ; AVX512F: # %bb.0: @@ -5658,15 +5738,17 @@ ; ; AVX2-LABEL: PR43609: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: # xmm6 = mem[0,0] ; AVX2-NEXT: vsubpd %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] @@ -5675,7 +5757,8 @@ ; AVX2-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX2-NEXT: vsubpd %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX2-NEXT: # xmm2 = mem[0,0] ; AVX2-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovupd %xmm0, (%rdi) @@ -5684,15 +5767,17 @@ ; ; AVX512F-LABEL: PR43609: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; AVX512F-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: # xmm6 = mem[0,0] ; AVX512F-NEXT: vsubpd %xmm6, %xmm0, %xmm0 ; AVX512F-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] @@ -5701,7 +5786,8 @@ ; AVX512F-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512F-NEXT: vsubpd %xmm6, %xmm1, %xmm1 ; AVX512F-NEXT: vaddpd %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512F-NEXT: # xmm2 = mem[0,0] ; AVX512F-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vmovupd %xmm0, (%rdi) @@ -5710,15 +5796,16 @@ ; ; AVX512VL-LABEL: PR43609: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; AVX512VL-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; 
AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512VL-NEXT: # xmm6 = mem[0,0] ; AVX512VL-NEXT: vsubpd %xmm6, %xmm0, %xmm0 ; AVX512VL-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] @@ -5727,7 +5814,8 @@ ; AVX512VL-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512VL-NEXT: vsubpd %xmm6, %xmm1, %xmm1 ; AVX512VL-NEXT: vaddpd %xmm1, %xmm2, %xmm1 -; AVX512VL-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512VL-NEXT: # xmm2 = mem[0,0] ; AVX512VL-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovupd %xmm0, (%rdi) @@ -5737,10 +5825,12 @@ ; AVX512DQ-LABEL: PR43609: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; AVX512DQ-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512DQ-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512DQ-NEXT: # xmm2 = mem[0,0] ; AVX512DQ-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovupd %xmm0, (%rdi) @@ -5750,10 +5840,11 @@ ; ; AVX512VLDQ-LABEL: PR43609: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512VLDQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1 ; AVX512VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0 ; AVX512VLDQ-NEXT: vcvtuqq2pd %xmm1, %xmm1 -; AVX512VLDQ-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512VLDQ-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512VLDQ-NEXT: # xmm2 = mem[0,0] ; AVX512VLDQ-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX512VLDQ-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX512VLDQ-NEXT: vmovupd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll --- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll @@ -71,7 +71,7 @@ ; ; AVX2-LABEL: max_gt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -477,7 +477,7 @@ ; ; AVX2-LABEL: max_ge_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -882,7 +882,7 @@ ; ; AVX2-LABEL: min_lt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1290,7 +1290,7 @@ ; ; AVX2-LABEL: min_le_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = 
[9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1437,7 +1437,8 @@ ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -1832,7 +1833,7 @@ ; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm4 @@ -2562,7 +2563,7 @@ ; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm3 ; AVX2-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 @@ -2682,7 +2683,7 @@ ; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm4 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -855,16 +855,27 @@ ; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: uaddo_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm0 -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovdqa %xmm1, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: uaddo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = 
[9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: uaddo_v2i64: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1229,7 +1229,8 @@ ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1585,7 +1586,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX2-NEXT: vpmullw %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] @@ -2246,7 +2247,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31] ; AVX2-NEXT: vpmullw %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm7 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23] @@ -2352,7 +2353,7 @@ ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63] ; AVX512BW-NEXT: vpmullw %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm5 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55] diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -902,16 +902,27 @@ ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: usubo_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm0 -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovdqa %xmm1, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: usubo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: usubo_v2i64: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -447,18 +447,44 @@ ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} 
xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v16i8: ; XOP: # %bb.0: @@ -470,10 +496,22 @@ ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX-LABEL: test_bitreverse_v16i8: -; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX-NEXT: retq +; GFNIAVX1-LABEL: test_bitreverse_v16i8: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512-LABEL: test_bitreverse_v16i8: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; GFNIAVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 +; GFNIAVX512-NEXT: retq %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ret <16 x i8> %b } @@ -524,19 +562,47 @@ ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: test_bitreverse_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v8i16: ; XOP: # %bb.0: @@ -549,11 +615,25 @@ ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX-LABEL: test_bitreverse_v8i16: -; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX-NEXT: retq +; GFNIAVX1-LABEL: test_bitreverse_v8i16: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v8i16: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512-LABEL: test_bitreverse_v8i16: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; GFNIAVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 +; GFNIAVX512-NEXT: retq %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ret <8 x i16> %b } @@ -609,19 +689,47 @@ ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v4i32: +; AVX1: # %bb.0: +; 
AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v4i32: ; XOP: # %bb.0: @@ -634,11 +742,25 @@ ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX-LABEL: test_bitreverse_v4i32: -; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX-NEXT: retq +; GFNIAVX1-LABEL: test_bitreverse_v4i32: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v4i32: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq +; +; GFNIAVX512-LABEL: test_bitreverse_v4i32: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; GFNIAVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 +; GFNIAVX512-NEXT: retq %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ret <4 x i32> %b } @@ -696,19 +818,47 @@ ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v2i64: ; XOP: # %bb.0: @@ -721,11 +871,25 @@ ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX-LABEL: test_bitreverse_v2i64: -; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX-NEXT: retq +; GFNIAVX1-LABEL: test_bitreverse_v2i64: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: test_bitreverse_v2i64: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 
+; GFNIAVX2-NEXT: retq +; +; GFNIAVX512-LABEL: test_bitreverse_v2i64: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] +; GFNIAVX512-NEXT: vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0 +; GFNIAVX512-NEXT: retq %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ret <2 x i64> %b } @@ -822,7 +986,7 @@ ; ; AVX2-LABEL: test_bitreverse_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -835,7 +999,7 @@ ; ; AVX512-LABEL: test_bitreverse_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1002,7 +1166,7 @@ ; AVX2-LABEL: test_bitreverse_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1016,7 +1180,7 @@ ; AVX512-LABEL: test_bitreverse_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1200,7 +1364,7 @@ ; AVX2-LABEL: test_bitreverse_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: 
vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1214,7 +1378,7 @@ ; AVX512-LABEL: test_bitreverse_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1402,7 +1566,7 @@ ; AVX2-LABEL: test_bitreverse_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1416,7 +1580,7 @@ ; AVX512-LABEL: test_bitreverse_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1645,7 +1809,7 @@ ; ; AVX2-LABEL: test_bitreverse_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -1665,7 +1829,7 @@ ; AVX512F-LABEL: test_bitreverse_v64i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -1685,7 +1849,7 @@ ; ; AVX512BW-LABEL: test_bitreverse_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1958,7 +2122,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -1981,7 +2145,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -2003,7 +2167,7 @@ ; AVX512BW-LABEL: test_bitreverse_v32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -2310,7 +2474,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -2333,7 +2497,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -2355,7 +2519,7 @@ ; AVX512BW-LABEL: test_bitreverse_v16i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -2670,7 +2834,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: 
vpshufb %ymm4, %ymm5, %ymm4 @@ -2693,7 +2857,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -2715,7 +2879,7 @@ ; AVX512BW-LABEL: test_bitreverse_v8i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll --- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -1034,16 +1034,27 @@ ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; -; AVX-LABEL: bool_reduction_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovmskpd %xmm0, %eax -; AVX-NEXT: cmpb $3, %al -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1-LABEL: bool_reduction_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: cmpb $3, %al +; AVX1-NEXT: sete %al +; AVX1-NEXT: retq +; +; AVX2-LABEL: bool_reduction_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: cmpb $3, %al +; AVX2-NEXT: sete %al +; AVX2-NEXT: retq ; ; AVX512-LABEL: bool_reduction_v2i64: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll --- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ 
b/llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -989,16 +989,27 @@ ; SSE-NEXT: setne %al ; SSE-NEXT: retq ; -; AVX-LABEL: bool_reduction_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovmskpd %xmm0, %eax -; AVX-NEXT: testl %eax, %eax -; AVX-NEXT: setne %al -; AVX-NEXT: retq +; AVX1-LABEL: bool_reduction_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: setne %al +; AVX1-NEXT: retq +; +; AVX2-LABEL: bool_reduction_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: retq ; ; AVX512-LABEL: bool_reduction_v2i64: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -32,11 +32,19 @@ ; CHECK-NEXT: divpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_fdiv_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0] -; AVX-NEXT: vdivpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_fdiv_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0] +; AVX1-NEXT: vdivpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_fdiv_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovddup {{.*#+}} xmm0 = [1.0E+1,1.0E+1] +; AVX512-NEXT: # xmm0 = mem[0,0] +; AVX512-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,2.0E+0] +; AVX512-NEXT: vdivpd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq entry: %div = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64( <2 x double> <double 1.000000e+00, double 2.000000e+00>, @@ -95,14 +103,25 @@ ; CHECK-NEXT: wait ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_fdiv_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,2.0E+0] -; AVX-NEXT: vdivpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_fdiv_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,2.0E+0] +; AVX1-NEXT: vdivpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_fdiv_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovddup {{.*#+}} xmm0 = [1.0E+1,1.0E+1] +; AVX512-NEXT: # xmm0 = mem[0,0] +; AVX512-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,2.0E+0] +; AVX512-NEXT: vdivpd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovsd {{.*#+}} xmm1 =
mem[0],zero +; AVX512-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq entry: %div = call <3 x double> @llvm.experimental.constrained.fdiv.v3f64( <3 x double> <double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>, @@ -429,11 +448,18 @@ ; CHECK-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_fmul_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] -; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_fmul_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX1-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_fmul_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovddup {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX512-NEXT: # xmm0 = mem[0,0] +; AVX512-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %mul = call <2 x double> @llvm.experimental.constrained.fmul.v2f64( <2 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF>, @@ -489,14 +515,24 @@ ; CHECK-NEXT: wait ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_fmul_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] -; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_fmul_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX1-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_fmul_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX512-NEXT: # xmm1 = mem[0,0] +; AVX512-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq entry: %mul = call <3 x double> @llvm.experimental.constrained.fmul.v3f64( <3 x double> @llvm.experimental.constrained.fadd.v2f64( <2 x double> , @@ -627,14 +670,24 @@ ; CHECK-NEXT: wait ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_fadd_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] -; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_fadd_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +;
AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_fadd_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX512-NEXT: # xmm1 = mem[0,0] +; AVX512-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq entry: %add = call <3 x double> @llvm.experimental.constrained.fadd.v3f64( <3 x double> @llvm.experimental.constrained.fsub.v2f64( <2 x double> , @@ -767,15 +827,26 @@ ; CHECK-NEXT: wait ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_fsub_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308] -; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_fsub_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vsubsd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308] +; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_fsub_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vsubsd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308] +; AVX512-NEXT: # xmm1 = mem[0,0] +; AVX512-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq entry: %sub = call <3 x double> @llvm.experimental.constrained.fsub.v3f64( <3 x double> %x to <8 x i32> @@ -322,8 +324,9 @@ ; ; AVX2-LABEL: bool_zext_xor: ; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: retq %xz = zext <8 x i1> %x to <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -84,7 +84,7 @@ ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -95,7 +95,7 @@ ; ; AVX512F-LABEL: var_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -106,7 +106,7 @@ ; ; AVX512VL-LABEL: var_funnnel_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = 
[63,63] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -117,7 +117,7 @@ ; ; AVX512BW-LABEL: var_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -138,7 +138,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -167,7 +167,7 @@ ; ; XOPAVX2-LABEL: var_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -506,7 +506,8 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 @@ -521,7 +522,8 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512F-NEXT: vpslld $16, %ymm0, %ymm0 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 @@ -536,7 +538,8 @@ ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VL-NEXT: vpslld $16, %ymm0, %ymm0 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %xmm1, %xmm2, %xmm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 @@ -547,7 +550,7 @@ ; AVX512BW-LABEL: var_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: 
# kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 @@ -569,7 +572,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 @@ -583,18 +586,31 @@ ; AVX512VLVBMI2-NEXT: vpshldvw %xmm2, %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: var_funnnel_v8i16: -; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0 -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 -; XOP-NEXT: vpshlw %xmm2, %xmm1, %xmm1 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: var_funnnel_v8i16: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_funnnel_v8i16: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2 +; XOPAVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: var_funnnel_v8i16: ; X86-SSE2: # %bb.0: @@ -756,7 +772,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero @@ -779,17 +796,18 @@ ; ; AVX512F-LABEL: var_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -797,17 +815,18 @@ ; ; AVX512VL-LABEL: var_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 -; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper @@ -815,7 +834,8 @@ ; ; AVX512BW-LABEL: var_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero @@ -834,7 +854,8 @@ ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79] ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm3, %zmm1 -; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VBMI2-NEXT: vpsllvw %zmm0, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -849,7 +870,8 @@ ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLBW-NEXT: vpand %xmm1, %xmm2, %xmm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -863,7 +885,8 @@ ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLVBMI2-NEXT: vpsllvw %ymm0, %ymm3, %ymm0 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -871,19 +894,33 @@ ; AVX512VLVBMI2-NEXT: vzeroupper ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: var_funnnel_v16i8: -; XOP: # %bb.0: -; XOP-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; XOP-NEXT: vpsubb %xmm4, %xmm5, %xmm4 -; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: var_funnnel_v16i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_funnnel_v16i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4 +; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: var_funnnel_v16i8: ; X86-SSE2: # %bb.0: @@ -952,20 +989,31 @@ ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -976,7 +1024,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -987,7 +1035,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512BW-NEXT: 
vpsrlq %xmm4, %xmm1, %xmm1
@@ -1008,7 +1056,7 @@
;
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
@@ -1023,16 +1071,27 @@
; AVX512VLVBMI2-NEXT: vpshldvq %xmm2, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatvar_funnnel_v2i64:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
@@ -2343,14 +2402,25 @@
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatconstant_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_funnnel_v16i8:
; AVX512F: # %bb.0:
@@ -2365,7 +2435,7 @@
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
@@ -2390,22 +2460,31 @@
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatconstant_funnnel_v16i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatconstant_funnnel_v16i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v16i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
; X86-SSE2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -354,7 +354,8 @@
; AVX2-LABEL: var_funnnel_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
; AVX2-NEXT: vpsllvd %ymm5, %ymm3, %ymm3
@@ -372,7 +373,8 @@
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpslld $16, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0
@@ -385,7 +387,8 @@
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpslld $16, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpsrld $16, %zmm0, %zmm0
@@ -395,7 +398,7 @@
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
@@ -415,7 +418,7 @@
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
@@ -451,7 +454,7 @@
;
; XOPAVX2-LABEL: var_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
@@ -545,7 +548,8 @@
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15]
; AVX2-NEXT: vpsllvd %ymm7, %ymm5, %ymm5
@@ -573,69 +577,77 @@
;
; AVX512F-LABEL: var_funnnel_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6
-; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpandn %ymm5, %ymm2, %ymm6
+; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
-; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
-; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpandn %ymm5, %ymm2, %ymm6
+; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4
+; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
@@ -645,7 +657,8 @@
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -658,7 +671,8 @@
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
; AVX512VBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} ymm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm0, %zmm3, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -671,7 +685,8 @@
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpsllw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -684,7 +699,8 @@
; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} ymm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLVBMI2-NEXT: vpsllvw %zmm0, %zmm3, %zmm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -717,9 +733,10 @@
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; XOPAVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; XOPAVX2-NEXT: vpshlb %xmm4, %xmm3, %xmm3
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
; XOPAVX2-NEXT: vpaddb %xmm6, %xmm5, %xmm7
; XOPAVX2-NEXT: vpshlb %xmm7, %xmm3, %xmm3
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm7
@@ -761,7 +778,7 @@
;
; AVX2-LABEL: splatvar_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
@@ -772,7 +789,7 @@
;
; AVX512F-LABEL: splatvar_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
@@ -783,7 +800,7 @@
;
; AVX512VL-LABEL: splatvar_funnnel_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
@@ -794,7 +811,7 @@
;
; AVX512BW-LABEL: splatvar_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
@@ -814,7 +831,7 @@
;
; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
@@ -849,7 +866,7 @@
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
@@ -2265,9 +2282,11 @@
; AVX2-LABEL: splatconstant_funnnel_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -2335,7 +2354,8 @@
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; XOPAVX2-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> )
ret <32 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -138,7 +138,9 @@
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpslld $16, %zmm4, %zmm4
; AVX512F-NEXT: vpord %zmm3, %zmm4, %zmm3
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4
+; AVX512F-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpsllvd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vpsrld $16, %zmm3, %zmm3
@@ -163,7 +165,9 @@
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpslld $16, %zmm4, %zmm4
; AVX512VL-NEXT: vpord %zmm3, %zmm4, %zmm3
-; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4
+; AVX512VL-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm3, %zmm3
; AVX512VL-NEXT: vpsrld $16, %zmm3, %zmm3
@@ -184,7 +188,7 @@
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
@@ -200,7 +204,7 @@
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
@@ -220,52 +224,53 @@
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm5
-; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4
+; AVX512F-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm9
-; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm7, %ymm10, %ymm7
-; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm7
-; AVX512F-NEXT: vpand %ymm4, %ymm7, %ymm7
-; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpxor %ymm4, %ymm3, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512F-NEXT: vpsrlw $1, %ymm6, %ymm6
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vpsrlw $4, %ymm6, %ymm8
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm6, %ymm6
+; AVX512F-NEXT: vpsrlw $2, %ymm6, %ymm8
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm10, %ymm8, %ymm8
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm6, %ymm6
+; AVX512F-NEXT: vpsrlw $1, %ymm6, %ymm8
+; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm8
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm8, %ymm6, %ymm5
+; AVX512F-NEXT: vpxor %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm7
-; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm6
-; AVX512F-NEXT: vpxor %ymm2, %ymm8, %ymm7
-; AVX512F-NEXT: vpsllw $5, %ymm7, %ymm7
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
+; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6
-; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512F-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6
-; AVX512F-NEXT: vpand %ymm4, %ymm6, %ymm4
-; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
@@ -289,52 +294,53 @@
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm5
-; AVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm7
-; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4
+; AVX512VL-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VL-NEXT: vpxor %ymm3, %ymm8, %ymm9
-; AVX512VL-NEXT: vpsllw $5, %ymm9, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw $2, %ymm5, %ymm7
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm7, %ymm10, %ymm7
-; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw $1, %ymm5, %ymm7
-; AVX512VL-NEXT: vpand %ymm4, %ymm7, %ymm7
-; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpxor %ymm4, %ymm3, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512VL-NEXT: vpsrlw $1, %ymm6, %ymm6
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT: vpsrlw $4, %ymm6, %ymm8
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm6, %ymm6
+; AVX512VL-NEXT: vpsrlw $2, %ymm6, %ymm8
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpand %ymm10, %ymm8, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm6, %ymm6
+; AVX512VL-NEXT: vpsrlw $1, %ymm6, %ymm8
+; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm8, %ymm6, %ymm5
+; AVX512VL-NEXT: vpxor %ymm4, %ymm2, %ymm4
+; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm7
-; AVX512VL-NEXT: vpand %ymm6, %ymm7, %ymm6
-; AVX512VL-NEXT: vpxor %ymm2, %ymm8, %ymm7
-; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
-; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
-; AVX512VL-NEXT: vpand %ymm4, %ymm6, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm6
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
@@ -359,7 +365,8 @@
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
; AVX512BW-NEXT: vpsllvw %zmm5, %zmm3, %zmm3
@@ -374,7 +381,8 @@
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm3, %zmm3
@@ -389,7 +397,8 @@
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
; AVX512VLBW-NEXT: vpsllvw %zmm5, %zmm3, %zmm3
@@ -404,7 +413,8 @@
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm3, %zmm3
@@ -426,7 +436,7 @@
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1
; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
@@ -437,7 +447,7 @@
;
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1
; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
@@ -448,7 +458,7 @@
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
@@ -465,7 +475,7 @@
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq $1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -87,7 +87,7 @@
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsllvq %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
@@ -366,8 +366,9 @@
;
; AVX2-LABEL: var_funnnel_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -384,11 +385,12 @@
;
; AVX512F-LABEL: var_funnnel_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
@@ -400,11 +402,12 @@
;
; AVX512VL-LABEL: var_funnnel_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
@@ -416,9 +419,10 @@
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -427,9 +431,10 @@
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -560,36 +565,66 @@
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: var_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm3
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm3
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: var_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $6, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpslld $8, %zmm0, %zmm2
; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm0
@@ -602,7 +637,8 @@
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpslld $8, %zmm0, %zmm2
; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpsrld $8, %zmm0, %zmm0
@@ -612,7 +648,8 @@
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
@@ -625,7 +662,8 @@
;
; AVX512VLBW-LABEL: var_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
@@ -640,7 +678,8 @@
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -654,7 +693,8 @@
; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VLVBMI2-NEXT: vpermb %ymm0, %ymm2, %ymm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -731,17 +771,29 @@
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpsllq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpsllq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
@@ -1812,14 +1864,25 @@
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatconstant_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_funnnel_v16i8:
; AVX512F: # %bb.0:
@@ -1834,7 +1897,7 @@
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
@@ -1850,7 +1913,7 @@
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
@@ -1866,7 +1929,7 @@
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -272,7 +272,8 @@
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
@@ -281,7 +282,7 @@
; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5
; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3
@@ -295,11 +296,12 @@
;
; AVX512F-LABEL: var_funnnel_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
@@ -309,11 +311,12 @@
;
; AVX512VL-LABEL: var_funnnel_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
@@ -324,9 +327,10 @@
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -334,9 +338,10 @@
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -430,21 +435,26 @@
; AVX2-LABEL: var_funnnel_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
;
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
@@ -464,7 +474,8 @@
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
@@ -485,14 +496,16 @@
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vpternlogq $234, %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30,32,32,34,34,36,36,38,38,40,40,42,42,44,44,46,46,48,48,50,50,52,52,54,54,56,56,58,58,60,60,62,62]
@@ -503,7 +516,8 @@
;
; AVX512VLBW-LABEL: var_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30,32,32,34,34,36,36,38,38,40,40,42,42,44,44,46,46,48,48,50,50,52,52,54,54,56,56,58,58,60,60,62,62]
@@ -517,7 +531,8 @@
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -529,7 +544,8 @@
; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -587,7 +603,7 @@
;
; AVX2-LABEL: splatvar_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsllq %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
@@ -1625,9 +1641,11 @@
; AVX2-LABEL: splatconstant_funnnel_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -36,12 +36,12 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpsrlvd %zmm3, %zmm5, %zmm3
@@ -63,12 +63,12 @@
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm5, %zmm3
@@ -90,9 +90,10 @@
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -100,9 +101,10 @@
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -139,7 +141,7 @@
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
@@ -182,7 +184,7 @@
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm6
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
@@ -206,7 +208,8 @@
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
@@ -221,7 +224,8 @@
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
@@ -236,7 +240,8 @@
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
@@ -251,7 +256,8 @@
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -84,7 +84,7 @@
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -95,7 +95,7 @@
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -106,7 +106,7 @@
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -117,7 +117,7 @@
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -138,7 +138,7 @@
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -168,7 +168,7 @@
;
; XOPAVX2-LABEL: var_funnnel_v2i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -565,7 +565,8 @@
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $16, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
@@ -580,7 +581,8 @@
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpslld $16, %ymm0, %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
@@ -594,7 +596,8 @@
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT: vpslld $16, %ymm0, %ymm0
; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
@@ -604,7 +607,7 @@
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -626,7 +629,7 @@
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -641,18 +644,31 @@
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: var_funnnel_v8i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpsllw $1, %xmm0, %xmm0
-; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vpshlw %xmm2, %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: var_funnnel_v8i16:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: var_funnnel_v8i16:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: var_funnnel_v8i16:
; X86-SSE2: # %bb.0:
@@ -825,41 +841,78 @@
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: var_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX-NEXT: vpsllw $5, %xmm4, %xmm4
-; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
-; AVX-NEXT: vpsrlw $4, %xmm1, %xmm6
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
-; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $2, %xmm1, %xmm4
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm4
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpsllw $5, %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm4
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: var_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX2-NEXT: vpsllw $5, %xmm5, %xmm5
+; AVX2-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpandn %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $5, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
@@ -876,7 +929,7 @@
;
; AVX512VL-LABEL: var_funnnel_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
@@ -893,7 +946,8 @@
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -911,7 +965,8 @@
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm3, %zmm1
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
@@ -925,7 +980,8 @@
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
@@ -938,25 +994,39 @@
; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %ymm0, %ymm3, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: var_funnnel_v16i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOP-NEXT: vpsubb %xmm4, %xmm5, %xmm4
-; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: var_funnnel_v16i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: var_funnnel_v16i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4
+; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: var_funnnel_v16i8:
; X86-SSE2: # %bb.0:
@@ -1041,20 +1111,31 @@
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpsllq $1, %xmm0, %xmm0
-; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1065,7 +1146,7 @@
;
; AVX512VL-LABEL: splatvar_funnnel_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1076,7 +1157,7 @@
;
; AVX512BW-LABEL: splatvar_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1097,7 +1178,7 @@
;
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -1113,16 +1194,27 @@
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatvar_funnnel_v2i64:
-; XOP: # %bb.0:
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpsllq $1, %xmm0, %xmm0
-; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
@@ -1376,25 +1468,38 @@
; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512F-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
@@ -1407,7 +1512,7 @@
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
@@ -1420,7 +1525,7 @@
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BW-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
@@ -1937,17 +2042,18 @@
;
; AVX2-LABEL: constant_funnnel_v16i8:
; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -2295,14 +2401,25 @@
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatconstant_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_funnnel_v16i8:
; AVX512F: # %bb.0:
@@ -2317,7 +2434,7 @@
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
@@ -2342,22 +2459,31 @@
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
-; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatconstant_funnnel_v16i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatconstant_funnnel_v16i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatconstant_funnnel_v16i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
; X86-SSE2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -384,7 +384,8 @@
; AVX2-LABEL: var_funnnel_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3
@@ -402,7 +403,8 @@
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpslld $16, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
@@ -414,7 +416,8 @@
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpslld $16, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
@@ -423,7 +426,7 @@
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
@@ -443,7 +446,7 @@
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
@@ -483,7 +486,7 @@
;
; XOPAVX2-LABEL: var_funnnel_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0
@@ -576,97 +579,112 @@
;
; AVX2-LABEL: var_funnnel_v32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX2-NEXT: vpsllw $5, %ymm4, %ymm4
-; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5
-; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm6
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm4
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm4
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5
+; AVX2-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX2-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpsllw $4, %ymm0, %ymm4
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpandn %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v32i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm5
+; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw
$5, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5 -; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; 
AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpandn %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -676,7 +694,8 @@ ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -688,7 +707,8 @@ ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95] ; AVX512VBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3 -; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} ymm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %ymm0, %ymm2, %ymm0 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm3, %zmm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 @@ -700,7 +720,8 @@ ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLBW-NEXT: vpsllw $8, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLBW-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 @@ -712,7 +733,8 @@ ; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95] ; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} ymm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %ymm0, %ymm2, %ymm0 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLVBMI2-NEXT: 
vpsrlvw %zmm0, %zmm3, %zmm0 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 @@ -743,25 +765,25 @@ ; ; XOPAVX2-LABEL: var_funnnel_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX2-NEXT: vpsubb %xmm3, %xmm4, %xmm5 -; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm6 -; XOPAVX2-NEXT: vpshlb %xmm5, %xmm6, %xmm5 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; XOPAVX2-NEXT: vpxor %xmm6, %xmm3, %xmm3 +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm6 +; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 +; XOPAVX2-NEXT: vpshlb %xmm6, %xmm7, %xmm6 +; XOPAVX2-NEXT: vpxor %xmm3, %xmm4, %xmm4 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm7 ; XOPAVX2-NEXT: vpaddb %xmm7, %xmm7, %xmm7 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm7, %xmm3 -; XOPAVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 -; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm6, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpshlb %xmm4, %xmm7, %xmm4 +; XOPAVX2-NEXT: vpor %xmm6, %xmm4, %xmm4 +; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm5 +; XOPAVX2-NEXT: vpshlb %xmm5, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ret <32 x i8> %res @@ -792,7 +814,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -803,7 +825,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -814,7 +836,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -825,7 +847,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -845,7 +867,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -881,7 +903,7 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: 
vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1163,7 +1185,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1176,7 +1198,7 @@ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1189,7 +1211,7 @@ ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1202,7 +1224,7 @@ ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512BW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1227,7 +1249,7 @@ ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1654,7 +1676,7 @@ ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1676,7 +1698,7 @@ ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm0, %ymm0 @@ -1698,7 +1720,7 @@ ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -2066,9 +2088,11 @@ ; AVX2-LABEL: splatconstant_funnnel_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2136,7 +2160,8 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; XOPAVX2-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) ret <32 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -142,7 +142,9 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpslld $16, %zmm4, %zmm4 ; AVX512F-NEXT: vpord %zmm3, %zmm4, %zmm3 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512F-NEXT: vpandq %zmm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 @@ -165,7 +167,9 @@ ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpslld $16, %zmm4, %zmm4 ; AVX512VL-NEXT: vpord %zmm3, %zmm4, %zmm3 -;
AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512VL-NEXT: vpandq %zmm4, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 @@ -184,7 +188,7 @@ ; ; AVX512BW-LABEL: var_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 @@ -201,7 +205,7 @@ ; ; AVX512VLBW-LABEL: var_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 @@ -222,52 +226,53 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 -; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4 +; AVX512F-NEXT: vpandq %zmm4, %zmm2, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpxor %ymm7, %ymm3, %ymm8 -; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6 -; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6 -; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpxor %ymm4, %ymm3, %ymm5 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $4, %ymm6, %ymm7 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm7, %ymm9, %ymm7 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm7, %ymm6, %ymm5 +; AVX512F-NEXT: vpxor %ymm4, %ymm2, %ymm4 +; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 -; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm6 -; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm5 -; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5 -; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm5 -; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm6 +; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm6 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 @@ -289,52 +294,53 @@ ; ; AVX512VL-LABEL: var_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; 
AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6 -; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4 +; AVX512VL-NEXT: vpandq %zmm4, %zmm2, %zmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VL-NEXT: vpxor %ymm7, %ymm3, %ymm8 -; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm6 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6 -; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm6 -; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpxor %ymm4, %ymm3, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $4, %ymm6, %ymm7 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm7, %ymm9, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm6, %ymm5 +; AVX512VL-NEXT: vpxor %ymm4, %ymm2, %ymm4 +; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 -; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5 -; AVX512VL-NEXT: vpxor %ymm7, %ymm2, %ymm6 -; AVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6 -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm6 +; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm0, 
%ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 @@ -357,11 +363,12 @@ ; AVX512BW-LABEL: var_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm5, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55] @@ -373,7 +380,8 @@ ; AVX512VBMI2-LABEL: var_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm2, %zmm2 ; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 @@ -387,11 +395,12 @@ ; AVX512VLBW-LABEL: var_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLBW-NEXT: vpandq %zmm4, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpandq %zmm5, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55] @@ -403,7 +412,8 @@ ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} zmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm2, %zmm2
 ; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
 ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3
@@ -424,7 +434,7 @@
 define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
 ; AVX512F-LABEL: splatvar_funnnel_v8i64:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -435,7 +445,7 @@
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v8i64:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
 ; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -446,7 +456,7 @@
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -464,7 +474,7 @@
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -628,7 +638,7 @@
 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
@@ -651,7 +661,7 @@
 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
@@ -672,7 +682,7 @@
 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
@@ -696,7 +706,7 @@
 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm3
 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
@@ -911,7 +921,7 @@
 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64]
 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
 ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
@@ -960,7 +970,7 @@
 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64]
 ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
 ; AVX512VL-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
@@ -984,7 +994,7 @@
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
@@ -1006,7 +1016,7 @@
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -87,7 +87,7 @@
 ;
 ; AVX2-LABEL: var_funnnel_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT: vpsrlvq %xmm3, %xmm0, %xmm3
 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
@@ -384,13 +384,14 @@
 ;
 ; AVX2-LABEL: var_funnnel_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
@@ -402,11 +403,12 @@
 ;
 ; AVX512F-LABEL: var_funnnel_v8i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512F-NEXT: vpsrlvd %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
@@ -418,11 +420,12 @@
 ;
 ; AVX512VL-LABEL: var_funnnel_v8i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VL-NEXT: vpsrlvd %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
@@ -434,9 +437,10 @@
 ; AVX512BW-LABEL: var_funnnel_v8i16:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -445,9 +449,10 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
 ; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -582,38 +587,70 @@
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: var_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm3
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpsubb %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm3
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: var_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $6, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: var_funnnel_v16i8:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512F-NEXT: vpslld $8, %zmm0, %zmm2
 ; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -625,7 +662,8 @@
 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VL-NEXT: vpslld $8, %zmm0, %zmm2
 ; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
@@ -634,7 +672,8 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v16i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
@@ -646,7 +685,8 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v16i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
@@ -660,7 +700,8 @@
 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
@@ -673,7 +714,8 @@
 ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512VLVBMI2-NEXT: vpermb %ymm0, %ymm2, %ymm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
@@ -753,17 +795,29 @@
 ; SSE-NEXT: por %xmm4, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
 ; AVX512F: # %bb.0:
@@ -1089,25 +1143,38 @@
 ; SSE-NEXT: packuswb %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatvar_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
@@ -1120,7 +1187,7 @@
 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
@@ -1133,7 +1200,7 @@
 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
@@ -1858,14 +1925,25 @@
 ; SSE-NEXT: por %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatconstant_funnnel_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_funnnel_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_funnnel_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512F: # %bb.0:
@@ -1880,7 +1958,7 @@
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1
 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
@@ -1896,7 +1974,7 @@
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1
 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
@@ -1912,7 +1990,7 @@
 ; AVX512VLVBMI2: # %bb.0:
 ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
 ; AVX512VLVBMI2-NEXT: retq
 ;
 ; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -288,7 +288,8 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
 ; AVX2-NEXT: vpsrlvd %ymm4, %ymm3, %ymm4
 ; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
@@ -297,7 +298,7 @@
 ; AVX2-NEXT: vpsrlvd %ymm5, %ymm0, %ymm5
 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
 ; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1
 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
 ; AVX2-NEXT: vpsllvd %ymm5, %ymm3, %ymm3
@@ -311,11 +312,12 @@
 ;
 ; AVX512F-LABEL: var_funnnel_v16i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
@@ -325,11 +327,12 @@
 ;
 ; AVX512VL-LABEL: var_funnnel_v16i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
@@ -340,9 +343,10 @@
 ; AVX512BW-LABEL: var_funnnel_v16i16:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -350,9 +354,10 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -454,23 +459,28 @@
 ; AVX2-LABEL: var_funnnel_v32i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT: vpsubb %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
@@ -517,7 +527,8 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v32i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30,32,32,34,34,36,36,38,38,40,40,42,42,44,44,46,46,48,48,50,50,52,52,54,54,56,56,58,58,60,60,62,62]
@@ -527,7 +538,8 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v32i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30,32,32,34,34,36,36,38,38,40,40,42,42,44,44,46,46,48,48,50,50,52,52,54,54,56,56,58,58,60,60,62,62]
@@ -540,7 +552,8 @@
 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
@@ -551,7 +564,8 @@
 ; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
 ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
@@ -613,7 +627,7 @@
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT: vpsrlq %xmm3, %ymm0, %ymm3
 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
@@ -929,7 +943,7 @@
 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
@@ -942,7 +956,7 @@
 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
@@ -955,7 +969,7 @@
 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
@@ -968,7 +982,7 @@
 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
@@ -981,7 +995,7 @@
 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
@@ -1671,9 +1685,11 @@
 ; AVX2-LABEL: splatconstant_funnnel_v32i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -36,12 +36,12 @@
 define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
 ; AVX512F-LABEL: var_funnnel_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
 ; AVX512F-NEXT: vpsllvd %zmm3, %zmm5, %zmm3
@@ -63,12 +63,12 @@
 ;
 ; AVX512VL-LABEL: var_funnnel_v32i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3
 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
 ; AVX512VL-NEXT: vpsllvd %zmm3, %zmm5, %zmm3
@@ -90,9 +90,10 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -100,9 +101,10 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v32i16:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -204,12 +206,13 @@
 ;
 ; AVX512BW-LABEL: var_funnnel_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
@@ -220,12 +223,13 @@
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v64i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm3
 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
@@ -236,7 +240,8 @@
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v64i8:
 ; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
@@ -250,7 +255,8 @@
 ;
 ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
 ; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
@@ -371,7 +377,7 @@
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
@@ -393,7 +399,7 @@
 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
@@ -414,7 +420,7 @@
 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
@@ -427,7 +433,7 @@
 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -154,13 +154,22 @@
 ; SSE-NEXT: paddw %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: test_div7_8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_div7_8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm1
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725] +; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm1 +; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> ret <8 x i16> %res } @@ -215,18 +224,21 @@ ; AVX2NOBW-LABEL: test_div7_16i8: ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427] +; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm1 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2NOBW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX2NOBW-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq @@ -234,17 +246,20 @@ ; AVX512BW-LABEL: test_div7_16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427] +; AVX512BW-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -327,7 +342,8 @@ ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, 
%xmm1 ; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2NOBW-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq @@ -345,7 +361,8 @@ ; AVX512BW-NEXT: vpsravw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -540,15 +557,27 @@ ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_rem7_8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 -; AVX-NEXT: vpsraw $1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_rem7_8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_rem7_8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725] +; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX2-NEXT: vpsraw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> ret <8 x i16> %res } @@ -612,21 +641,25 @@ ; AVX2NOBW-LABEL: test_rem7_16i8: ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427] +; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm2 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2NOBW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX2NOBW-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpaddb %xmm2, 
%xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX2NOBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper @@ -635,20 +668,24 @@ ; AVX512BW-LABEL: test_rem7_16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427] +; AVX512BW-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -791,11 +828,13 @@ ; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2NOBW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2NOBW-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -815,7 +854,8 @@ ; AVX512BW-NEXT: vpsravw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $7, %xmm2, %xmm2 -; 
AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -146,7 +146,8 @@ ; ; AVX2-LABEL: test_div7_16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] +; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm1 ; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 @@ -201,7 +202,7 @@ ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] @@ -209,30 +210,35 @@ ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm1 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX2NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2NOBW-NEXT: 
vpaddb %ymm1, %ymm0, %ymm0 ; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0 ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_div7_32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsubb %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: retq %res = sdiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> @@ -316,7 +322,8 @@ ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpackuswb %ymm1, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2NOBW-NEXT: retq ; @@ -329,7 +336,8 @@ ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -529,11 +537,13 @@ ; ; AVX2-LABEL: test_rem7_16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] +; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2 ; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %res = srem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> @@ 
-595,7 +605,7 @@ ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] @@ -603,16 +613,19 @@ ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm2 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2NOBW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX2NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsllw $3, %ymm1, %ymm2 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX2NOBW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2NOBW-NEXT: retq @@ -620,20 +633,24 @@ ; AVX512BW-LABEL: test_rem7_32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, 
%ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -735,11 +752,12 @@ ; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2NOBW-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2NOBW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -757,7 +775,8 @@ ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 ; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -103,7 +103,7 @@ ; AVX512F-LABEL: test_div7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = 
[18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] ; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm3 ; AVX512F-NEXT: vpsraw $1, %ymm1, %ymm1 @@ -117,7 +117,8 @@ ; ; AVX512BW-LABEL: test_div7_32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] +; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 @@ -132,7 +133,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] @@ -141,12 +142,12 @@ ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512F-NEXT: vpxor %ymm7, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1 @@ -172,7 +173,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] @@ -181,12 +182,14 @@ ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpternlogq $108, %zmm1, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpsubb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = sdiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> ret <64 x i8> %res @@ -221,7 +224,7 @@ ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] @@ -269,7 +272,8 @@ ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq %res = sdiv <64 x i8> %a, @@ -411,7 +415,7 @@ ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] ; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4 ; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3 @@ -431,11 +435,13 @@ ; ; AVX512BW-LABEL: test_rem7_32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] +; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $15, %zmm1, %zmm2 ; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = srem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> @@ -448,7 +454,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] @@ -457,17 +463,17 @@ ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm3 ; AVX512F-NEXT: vpaddb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpsubb %ymm8, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5 ; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 @@ -497,7 +503,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] @@ -506,14 +512,17 @@ ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpternlogq $108, %zmm2, %zmm4, %zmm3 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpsubb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vpsubb %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -550,12 +559,12 @@ ; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm6, %ymm5 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: 
vpand %ymm3, %ymm7, %ymm7 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 @@ -615,11 +624,12 @@ ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -151,14 +151,24 @@ ; SSE-NEXT: psrlw $2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_div7_8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_div7_8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_div7_8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363] +; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX2-NEXT: retq %res = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> ret <8 x i16> %res } @@ -227,31 +237,37 @@ ; AVX2NOBW-LABEL: test_div7_16i8: ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $8, 
%ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2NOBW-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm0 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX2NOBW-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_div7_16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %res = udiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> @@ -612,16 +628,29 @@ ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_rem7_8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_rem7_8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_rem7_8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363] +; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX2-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %res = urem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> ret <8 x i16> %res } @@ -706,18 +735,22 @@ ; AVX2NOBW-LABEL: test_rem7_16i8: ; AVX2NOBW: # %bb.0: ; 
AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 ; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2NOBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX2NOBW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX2NOBW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper @@ -726,17 +759,21 @@ ; AVX512BW-LABEL: test_rem7_16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 ; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -901,7 +938,8 @@ ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -151,7 +151,8 @@ ; ; AVX2-LABEL: test_div7_16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 @@ -202,7 +203,7 @@ ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -211,24 +212,29 @@ ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX2NOBW-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm0 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX2NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_div7_32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: 
vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq %res = udiv <32 x i8> %a, ret <32 x i8> %res @@ -562,12 +568,14 @@ ; ; AVX2-LABEL: test_rem7_16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %res = urem <16 x i16> %a, @@ -624,7 +632,7 @@ ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -633,12 +641,15 @@ ; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2NOBW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX2NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsllw $3, %ymm1, %ymm2 -; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = 
[248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX2NOBW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2NOBW-NEXT: retq @@ -646,17 +657,21 @@ ; AVX512BW-LABEL: test_rem7_32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -777,7 +792,7 @@ ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2NOBW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -103,7 +103,7 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_div7_32i16: ; AVX512F: # %bb.0: -; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 @@ -120,7 +120,8 @@ ; ; AVX512BW-LABEL: test_div7_32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] +; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 @@ -135,7 +136,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -144,7 +145,7 @@ ; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 @@ -162,14 +163,16 @@ ; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_div7_64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] @@ -178,10 +181,12 @@ ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = udiv <64 x i8> %a, ret <64 x i8> %res @@ -431,7 +436,7 @@ ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 @@ -453,12 +458,14 @@ ; ; AVX512BW-LABEL: test_rem7_32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] +; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm1 ; 
AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = urem <32 x i16> %a, @@ -471,7 +478,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] @@ -480,14 +487,14 @@ ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm5 ; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7 ; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 @@ -515,7 +522,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] @@ -524,12 +531,15 @@ ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -570,7 +580,7 @@ ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; 
AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 @@ -641,7 +651,7 @@ ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm1[0],zmm2[1],zmm1[1],zmm2[2],zmm1[2],zmm2[3],zmm1[3],zmm2[4],zmm1[4],zmm2[5],zmm1[5],zmm2[6],zmm1[6],zmm2[7],zmm1[7],zmm2[16],zmm1[16],zmm2[17],zmm1[17],zmm2[18],zmm1[18],zmm2[19],zmm1[19],zmm2[20],zmm1[20],zmm2[21],zmm1[21],zmm2[22],zmm1[22],zmm2[23],zmm1[23],zmm2[32],zmm1[32],zmm2[33],zmm1[33],zmm2[34],zmm1[34],zmm2[35],zmm1[35],zmm2[36],zmm1[36],zmm2[37],zmm1[37],zmm2[38],zmm1[38],zmm2[39],zmm1[39],zmm2[48],zmm1[48],zmm2[49],zmm1[49],zmm2[50],zmm1[50],zmm2[51],zmm1[51],zmm2[52],zmm1[52],zmm2[53],zmm1[53],zmm2[54],zmm1[54],zmm2[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -149,21 +149,37 @@ ; SSE-NEXT: movdqa %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX-LABEL: load_i8_stride2_vf16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vmovdqa (%rdi), %xmm1 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX-NEXT: vpand %xmm0, %xmm2, %xmm3 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: retq +; AVX1-LABEL: load_i8_stride2_vf16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdx) +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i8_stride2_vf16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm3 +; AVX2-NEXT: 
vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm1, (%rdx) +; AVX2-NEXT: retq ; ; AVX512-LABEL: load_i8_stride2_vf16: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -3,9 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=NOBW,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=NOBW,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=NOBW,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=NOBW,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=AVX512VLBWDQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd,+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX512,AVX512VLCD ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd | FileCheck %s --check-prefixes=NOBW,AVX512,AVX512CD @@ -158,40 +158,70 @@ ; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; 
AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv2i64: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -408,40 +438,70 @@ ; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv2i64u: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; 
AVX-NEXT: retq +; AVX1-LABEL: testv2i64u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv2i64u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv2i64u: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -650,35 +710,60 @@ ; SSE41-NEXT: paddd %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 
-; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv4i32: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -876,35 +961,60 @@ ; SSE41-NEXT: paddd %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv4i32u: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; 
AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv4i32u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i32u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv4i32u: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -1078,30 +1188,50 @@ ; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; 
AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv8i16: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -1119,7 +1249,8 @@ ; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLCD-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLCD-NEXT: vpbroadcastw {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] +; AVX512VLCD-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX512VLCD-NEXT: vzeroupper ; AVX512VLCD-NEXT: retq ; @@ -1128,7 +1259,8 @@ ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512CD-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastw {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] +; AVX512CD-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; @@ -1268,30 +1400,50 @@ ; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv8i16u: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv8i16u: +; AVX1: # 
%bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i16u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv8i16u: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 ; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 @@ -1309,7 +1461,8 @@ ; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLCD-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLCD-NEXT: vpbroadcastw {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] +; AVX512VLCD-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX512VLCD-NEXT: vzeroupper ; AVX512VLCD-NEXT: retq ; @@ -1318,7 +1471,8 @@ ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512CD-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastw {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16] +; AVX512CD-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; @@ -1442,25 +1596,40 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; 
AVX1-LABEL: testv16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv16i8: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpand %xmm3, %xmm2, %xmm2 @@ -1473,7 +1642,8 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1592,25 +1762,40 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8u: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv16i8u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i8u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: testv16i8u: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpand %xmm3, %xmm2, %xmm2 @@ -1623,7 +1808,8 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -68,7 +68,8 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -96,7 +97,8 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -124,7 +126,8 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -164,7 +167,8 @@ ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 -; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm4, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -250,7 +254,8 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -278,7 +283,8 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -306,7 +312,8 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -346,7 +353,8 @@ ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 -; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm4, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -422,7 +430,8 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: 
vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -445,7 +454,8 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -468,7 +478,8 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -503,7 +514,8 @@ ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 -; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm4, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -574,7 +586,8 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -597,7 +610,8 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -620,7 +634,8 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -655,7 +670,8 @@ ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 -; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm4, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -716,7 +732,8 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -734,7 +751,8 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -752,7 +770,8 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -770,7 +789,8 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; X32-AVX-LABEL: testv16i16: @@ -778,7 +798,8 @@ ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, 
%ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 -; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm4, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -833,7 +854,8 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -851,7 +873,8 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -869,7 +892,8 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -887,7 +911,8 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; X32-AVX-LABEL: testv16i16u: @@ -895,7 +920,8 @@ ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 -; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm4, %ymm3, %ymm3 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 @@ -940,7 +966,8 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; 
AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -953,7 +980,8 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -966,7 +994,8 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -980,7 +1009,7 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 @@ -994,7 +1023,8 @@ ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm3, %ymm0, %ymm0 ; X32-AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X32-AVX-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; X32-AVX-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1034,7 +1064,8 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1047,7 +1078,8 @@ ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1060,7 +1092,8 @@ ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBWDQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1074,7 +1107,7 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 @@ -1088,7 +1121,8 @@ ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm3, %ymm0, %ymm0 ; X32-AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X32-AVX-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm3 ; X32-AVX-NEXT: vpand %ymm3, %ymm2, %ymm2 diff --git 
a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -29,7 +29,7 @@ ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -56,7 +56,7 @@ ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 @@ -106,7 +106,7 @@ ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -133,7 +133,7 @@ ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 @@ -181,7 +181,7 @@ ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -210,7 +210,7 @@ ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 @@ -266,7 +266,7 @@ ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -295,7 +295,7 @@ ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 @@ -335,7 +335,7 @@ ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512CD-NEXT: vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 @@ -350,7 +350,7 @@ ; AVX512CDBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CDBW-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CDBW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512CDBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512CDBW-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512CDBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 @@ -361,15 +361,16 @@ ; ; AVX512BW-LABEL: testv32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vptestnmb %zmm3, %zmm3, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm4 +; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 @@ -384,7 +385,7 @@ ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7 @@ -421,7 +422,7 @@ ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512CD-NEXT: vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512CD-NEXT: vpsubw 
%ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 @@ -436,7 +437,7 @@ ; AVX512CDBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CDBW-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CDBW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512CDBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512CDBW-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512CDBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 @@ -447,15 +448,16 @@ ; ; AVX512BW-LABEL: testv32i16u: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vptestnmb %zmm3, %zmm3, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm4 +; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 @@ -470,7 +472,7 @@ ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7 @@ -508,7 +510,7 @@ ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 @@ -535,7 +537,7 @@ ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CDBW-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CDBW-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512CDBW-NEXT: vpsubb %xmm3, %xmm2, %xmm2 ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm1, %zmm1 @@ -557,15 +559,16 @@ ; ; AVX512BW-LABEL: testv64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: 
vptestnmb %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm3 +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: testv64i8: @@ -574,7 +577,7 @@ ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6 @@ -602,7 +605,7 @@ ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 @@ -629,7 +632,7 @@ ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CDBW-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CDBW-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512CDBW-NEXT: vpsubb %xmm3, %xmm2, %xmm2 ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm1, %zmm1 @@ -651,15 +654,16 @@ ; ; AVX512BW-LABEL: testv64i8u: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm3 +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: testv64i8u: @@ -668,7 +672,7 @@ ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6 diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -75,13 +75,15 @@ ; X64-AVX2-LABEL: mul_v16i8_32: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v16i8_32: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $5, %xmm0, %xmm0 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; X64-AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i8> %a0, ret <16 x i8> %1 @@ -283,7 +285,8 @@ ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper @@ -376,10 +379,22 @@ ; X64-SSE-NEXT: pmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: mul_v8i16_17: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-XOP-LABEL: mul_v8i16_17: +; X64-XOP: # %bb.0: +; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: retq +; +; X64-AVX2-LABEL: mul_v8i16_17: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17] +; X64-AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512DQ-LABEL: mul_v8i16_17: +; X64-AVX512DQ: # %bb.0: +; X64-AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17] +; X64-AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: retq %1 = mul <8 x i16> %a0, ret <8 x i16> %1 } @@ -410,14 +425,16 @@ ; X64-AVX2-LABEL: mul_v16i8_17: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v16i8_17: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $4, %xmm0, %xmm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X64-AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i8> %a0, @@ -539,12 +556,14 @@ ; ; X64-AVX2-LABEL: mul_v16i16_17: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17] +; X64-AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v16i16_17: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17] +; X64-AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i16> %a0, ret <16 x i16> %1 @@ -578,14 +597,16 @@ ; X64-AVX2-LABEL: mul_v32i8_17: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X64-AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v32i8_17: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X64-AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; X64-AVX512DQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <32 x i8> %a0, @@ -676,10 +697,22 @@ ; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: mul_v8i16_neg9: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; 
X64-AVX-NEXT: retq +; X64-XOP-LABEL: mul_v8i16_neg9: +; X64-XOP: # %bb.0: +; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: retq +; +; X64-AVX2-LABEL: mul_v8i16_neg9: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65527,65527,65527,65527,65527,65527,65527,65527] +; X64-AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512DQ-LABEL: mul_v8i16_neg9: +; X64-AVX512DQ: # %bb.0: +; X64-AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65527,65527,65527,65527,65527,65527,65527,65527] +; X64-AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: retq %1 = mul <8 x i16> %a0, ret <8 x i16> %1 } @@ -716,7 +749,8 @@ ; X64-AVX2-LABEL: mul_v16i8_neg5: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsllw $2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 @@ -725,7 +759,8 @@ ; X64-AVX512DQ-LABEL: mul_v16i8_neg5: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vpbroadcastb {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X64-AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpsubb %xmm0, %xmm1, %xmm0 @@ -877,12 +912,14 @@ ; ; X64-AVX2-LABEL: mul_v16i16_neg9: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527] +; X64-AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v16i16_neg9: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527,65527] +; X64-AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i16> %a0, ret <16 x i16> %1 @@ -924,7 +961,8 @@ ; X64-AVX2-LABEL: mul_v32i8_neg5: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsllw $2, %ymm0, %ymm1 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X64-AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm0 @@ -933,7 +971,8 @@ ; X64-AVX512DQ-LABEL: mul_v32i8_neg5: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X64-AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; X64-AVX512DQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpsubb %ymm0, %ymm1, %ymm0 @@ 
-1121,7 +1160,8 @@ ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper @@ -1218,10 +1258,22 @@ ; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: mul_v8i16_7: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-XOP-LABEL: mul_v8i16_7: +; X64-XOP: # %bb.0: +; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: retq +; +; X64-AVX2-LABEL: mul_v8i16_7: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; X64-AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512DQ-LABEL: mul_v8i16_7: +; X64-AVX512DQ: # %bb.0: +; X64-AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; X64-AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: retq %1 = mul <8 x i16> %a0, ret <8 x i16> %1 } @@ -1254,14 +1306,16 @@ ; X64-AVX2-LABEL: mul_v16i8_31: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v16i8_31: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $5, %xmm0, %xmm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vpbroadcastb {{.*#+}} xmm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; X64-AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpsubb %xmm0, %xmm1, %xmm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i8> %a0, @@ -1344,10 +1398,22 @@ ; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: mul_v8i16_neg31: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-XOP-LABEL: mul_v8i16_neg31: +; X64-XOP: # %bb.0: +; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: retq +; +; X64-AVX2-LABEL: mul_v8i16_neg31: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65505,65505,65505,65505,65505,65505,65505,65505] +; X64-AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512DQ-LABEL: mul_v8i16_neg31: +; X64-AVX512DQ: # %bb.0: +; X64-AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65505,65505,65505,65505,65505,65505,65505,65505] +; X64-AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: retq %1 = mul <8 x i16> %a0, ret <8 x i16> %1 } @@ -1378,14 +1444,16 @@ ; X64-AVX2-LABEL: mul_v16i8_neg15: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX2-NEXT: vpbroadcastb 
{{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v16i8_neg15: ; X64-AVX512DQ: # %bb.0: ; X64-AVX512DQ-NEXT: vpsllw $4, %xmm0, %xmm1 -; X64-AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X64-AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i8> %a0, @@ -1490,14 +1558,15 @@ ; ; X64-AVX2-LABEL: mul_v2i64_neg_15_63: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] -; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 -; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] +; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551601,18446744073709551553] +; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v2i64_neg_15_63: @@ -1551,14 +1620,15 @@ ; ; X64-AVX2-LABEL: mul_v2i64_neg_17_65: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] -; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 -; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] +; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551599,18446744073709551551] +; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v2i64_neg_17_65: @@ -1821,7 +1891,8 @@ ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-pack-128.ll b/llvm/test/CodeGen/X86/vector-pack-128.ll --- a/llvm/test/CodeGen/X86/vector-pack-128.ll +++ 
b/llvm/test/CodeGen/X86/vector-pack-128.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f | FileCheck %s --check-prefixes=AVX,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 ; trunc(concat(x,y)) -> pack @@ -95,12 +95,28 @@ ; SSE-NEXT: packsswb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: trunc_concat_packsswb_128: -; AVX: # %bb.0: -; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: trunc_concat_packsswb_128: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_concat_packsswb_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_concat_packsswb_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = ashr <8 x i16> %a0, %2 = and <8 x i16> %a1, %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> @@ -116,12 +132,28 @@ ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: trunc_concat_packuswb_128: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: trunc_concat_packuswb_128: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_concat_packuswb_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_concat_packuswb_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = lshr <8 x i16> %a0, %2 = and <8 x i16> %a1, %3 = shufflevector 
<8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -233,12 +265,30 @@ ; SSE-NEXT: packsswb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: concat_trunc_packsswb_128: -; AVX: # %bb.0: -; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: concat_trunc_packsswb_128: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_trunc_packsswb_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_packsswb_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> %2 = and <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> %3 = trunc <8 x i16> %1 to <8 x i8> @@ -255,12 +305,28 @@ ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: concat_trunc_packuswb_128: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: concat_trunc_packuswb_128: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_trunc_packuswb_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_trunc_packuswb_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> %2 = and <8 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> %3 = trunc <8 x i16> %1 to <8 x i8> diff --git a/llvm/test/CodeGen/X86/vector-pack-256.ll b/llvm/test/CodeGen/X86/vector-pack-256.ll --- a/llvm/test/CodeGen/X86/vector-pack-256.ll +++ b/llvm/test/CodeGen/X86/vector-pack-256.ll @@ -96,14 +98,16 @@ ; AVX2-LABEL: trunc_concat_packsswb_256: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_concat_packsswb_256: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 =
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero @@ -116,7 +118,8 @@ ; AVX512BW-LABEL: trunc_concat_packsswb_256: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -145,14 +148,16 @@ ; AVX2-LABEL: trunc_concat_packuswb_256: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_concat_packuswb_256: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero @@ -165,7 +170,8 @@ ; AVX512BW-LABEL: trunc_concat_packuswb_256: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $15, %ymm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -243,7 +249,8 @@ ; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -254,7 +261,8 @@ ; AVX512-NEXT: vpsrld $17, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512-NEXT: vpmovdw %ymm1, %xmm1 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -291,7 +299,8 @@ ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 
= [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -304,7 +313,8 @@ ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -315,7 +325,8 @@ ; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -352,7 +363,8 @@ ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -365,7 +377,8 @@ ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -376,7 +389,8 @@ ; AVX512BW-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-pack-512.ll b/llvm/test/CodeGen/X86/vector-pack-512.ll --- a/llvm/test/CodeGen/X86/vector-pack-512.ll +++ b/llvm/test/CodeGen/X86/vector-pack-512.ll @@ -51,7 +51,9 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; 
AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15] @@ -74,7 +76,8 @@ ; AVX512BW-LABEL: trunc_concat_packsswb_512: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] @@ -97,7 +100,9 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15] @@ -120,7 +125,8 @@ ; AVX512BW-LABEL: trunc_concat_packuswb_512: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] @@ -195,7 +201,8 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] @@ -207,7 +214,8 @@ ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] @@ -238,7 +246,8 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] @@ -250,7 +259,8 @@ ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15] diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -255,11 +255,25 @@ ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: cmpeq_zext_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: cmpeq_zext_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: cmpeq_zext_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: cmpeq_zext_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %cmp = icmp eq <16 x i8> %a, %b %zext = zext <16 x i1> %cmp to <16 x i8> ret <16 x i8> %zext @@ -388,13 +402,15 @@ ; AVX2-LABEL: cmpgt_zext_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: cmpgt_zext_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %cmp = icmp sgt <32 x i8> %a, %b %zext = zext <32 x i1> %cmp to <32 x i8> diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -69,14 +69,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, 
%zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; BITALG_NOVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_1_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; BITALG-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -108,7 +110,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -116,7 +118,8 @@ ; BITALG-LABEL: ult_2_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG-NEXT: vpcmpltub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -214,7 +217,7 @@ ; ; AVX2-LABEL: ugt_2_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -222,7 +225,8 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -231,7 +235,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -240,7 +245,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -248,14 +254,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_2_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -355,7 +363,7 @@ ; ; AVX2-LABEL: ult_3_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -363,7 +371,8 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -372,7 +381,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -382,7 +391,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; 
AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -391,7 +400,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -399,7 +408,8 @@ ; BITALG-LABEL: ult_3_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpcmpltub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -497,7 +507,7 @@ ; ; AVX2-LABEL: ugt_3_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -505,7 +515,8 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -514,7 +525,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -523,7 +535,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -531,14 +544,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; 
BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_3_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -638,7 +653,7 @@ ; ; AVX2-LABEL: ult_4_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -646,7 +661,8 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -655,7 +671,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -665,7 +681,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -674,7 +690,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -682,7 +698,8 @@ ; 
BITALG-LABEL: ult_4_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpcmpltub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -780,7 +797,7 @@ ; ; AVX2-LABEL: ugt_4_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -788,7 +805,8 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -797,7 +815,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -806,7 +825,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -814,14 +834,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_4_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 ; BITALG-NEXT: 
vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -921,7 +943,7 @@ ; ; AVX2-LABEL: ult_5_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -929,7 +951,8 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -938,7 +961,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -948,7 +971,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -957,7 +980,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -965,7 +988,8 @@ ; BITALG-LABEL: ult_5_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpcmpltub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1063,7 +1087,7 @@ ; ; AVX2-LABEL: ugt_5_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: 
vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1071,7 +1095,8 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1080,7 +1105,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1089,7 +1115,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1097,14 +1124,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_5_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1204,7 +1233,7 @@ ; ; AVX2-LABEL: ult_6_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1212,7 +1241,8 @@ ; AVX2-NEXT: vpand %xmm1, 
%xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1221,7 +1251,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1231,7 +1261,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -1240,7 +1270,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1248,7 +1278,8 @@ ; BITALG-LABEL: ult_6_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpcmpltub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1346,7 +1377,7 @@ ; ; AVX2-LABEL: ugt_6_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1354,7 +1385,8 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpmaxub 
%xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1363,7 +1395,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1372,7 +1405,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1380,14 +1414,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_6_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1487,7 +1523,7 @@ ; ; AVX2-LABEL: ult_7_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1495,7 +1531,8 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1504,7 +1541,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1514,7 +1551,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -1523,7 +1560,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1531,7 +1568,8 @@ ; BITALG-LABEL: ult_7_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG-NEXT: vpcmpltub %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1598,14 +1636,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_1_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -1637,7 +1677,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; 
BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1645,7 +1685,8 @@ ; BITALG-LABEL: ult_2_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -1757,7 +1798,7 @@ ; ; AVX2-LABEL: ugt_2_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1768,7 +1809,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v8i16: @@ -1776,7 +1818,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1785,7 +1828,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1793,14 +1837,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_2_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -1917,7 +1963,7 @@ ; ; AVX2-LABEL: ult_3_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1928,7 +1974,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; 
AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1937,7 +1983,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1947,7 +1993,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -1956,7 +2002,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1964,7 +2010,8 @@ ; BITALG-LABEL: ult_3_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2076,7 +2123,7 @@ ; ; AVX2-LABEL: ugt_3_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2087,7 +2134,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v8i16: @@ -2095,7 +2143,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2104,7 +2153,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2112,14 +2162,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_3_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2236,7 +2288,7 @@ ; ; AVX2-LABEL: ult_4_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2247,7 +2299,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2256,7 +2308,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2266,7 +2318,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -2275,7 +2327,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2283,7 +2335,8 @@ ; BITALG-LABEL: ult_4_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; 
BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2395,7 +2448,7 @@ ; ; AVX2-LABEL: ugt_4_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2406,7 +2459,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v8i16: @@ -2414,7 +2468,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2423,7 +2478,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2431,14 +2487,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_4_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2555,7 +2613,7 @@ ; ; AVX2-LABEL: ult_5_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2566,7 +2624,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpbroadcastw 
{{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2575,7 +2633,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2585,7 +2643,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -2594,7 +2652,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2602,7 +2660,8 @@ ; BITALG-LABEL: ult_5_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2714,7 +2773,7 @@ ; ; AVX2-LABEL: ugt_5_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2725,7 +2784,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v8i16: @@ -2733,7 +2793,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2742,7 +2803,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: 
vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2750,14 +2812,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_5_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2874,7 +2938,7 @@ ; ; AVX2-LABEL: ult_6_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2885,7 +2949,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2894,7 +2958,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2904,7 +2968,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -2913,7 +2977,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2921,7 +2985,8 @@ ; BITALG-LABEL: ult_6_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = 
tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3033,7 +3098,7 @@ ; ; AVX2-LABEL: ugt_6_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3044,7 +3109,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v8i16: @@ -3052,7 +3118,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -3061,7 +3128,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -3069,14 +3137,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_6_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3193,7 +3263,7 @@ ; ; AVX2-LABEL: ult_7_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3204,7 +3274,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3213,7 +3283,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3223,7 +3293,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -3232,7 +3302,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3240,7 +3310,8 @@ ; BITALG-LABEL: ult_7_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3352,7 +3423,7 @@ ; ; AVX2-LABEL: ugt_7_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3363,7 +3434,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_7_v8i16: @@ -3371,7 +3443,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -3380,7 +3453,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, 
%xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -3388,14 +3462,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_7_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3512,7 +3588,7 @@ ; ; AVX2-LABEL: ult_8_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3523,7 +3599,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3532,7 +3608,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3542,7 +3618,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -3551,7 +3627,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3559,7 +3635,8 @@ ; BITALG-LABEL: ult_8_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3671,7 +3748,7 @@ ; ; AVX2-LABEL: ugt_8_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3682,7 +3759,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_8_v8i16: @@ -3690,7 +3768,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -3699,7 +3778,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -3707,14 +3787,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_8_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3831,7 +3913,7 @@ ; ; AVX2-LABEL: ult_9_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3842,7 +3924,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3851,7 +3933,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa 
{{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3861,7 +3943,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -3870,7 +3952,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3878,7 +3960,8 @@ ; BITALG-LABEL: ult_9_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3990,7 +4073,7 @@ ; ; AVX2-LABEL: ugt_9_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4001,7 +4084,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_9_v8i16: @@ -4009,7 +4093,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -4018,7 +4103,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -4026,14 +4112,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw 
%zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_9_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4150,7 +4238,7 @@ ; ; AVX2-LABEL: ult_10_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4161,7 +4249,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4170,7 +4258,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4180,7 +4268,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -4189,7 +4277,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4197,7 +4285,8 @@ ; BITALG-LABEL: ult_10_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4309,7 +4398,7 @@ ; ; AVX2-LABEL: ugt_10_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand 
%xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4320,7 +4409,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_10_v8i16: @@ -4328,7 +4418,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -4337,7 +4428,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -4345,14 +4437,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_10_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4469,7 +4563,7 @@ ; ; AVX2-LABEL: ult_11_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4480,7 +4574,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4489,7 +4583,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = 
[11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4499,7 +4593,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -4508,7 +4602,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4516,7 +4610,8 @@ ; BITALG-LABEL: ult_11_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4628,7 +4723,7 @@ ; ; AVX2-LABEL: ugt_11_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4639,7 +4734,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_11_v8i16: @@ -4647,7 +4743,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -4656,7 +4753,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -4664,14 +4762,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; 
BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_11_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4788,7 +4888,7 @@ ; ; AVX2-LABEL: ult_12_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4799,7 +4899,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4808,7 +4908,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4818,7 +4918,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -4827,7 +4927,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4835,7 +4935,8 @@ ; BITALG-LABEL: ult_12_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4947,7 +5048,7 @@ ; ; AVX2-LABEL: ugt_12_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand 
%xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4958,7 +5059,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_12_v8i16: @@ -4966,7 +5068,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -4975,7 +5078,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -4983,14 +5087,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_12_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -5107,7 +5213,7 @@ ; ; AVX2-LABEL: ult_13_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5118,7 +5224,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5127,7 +5233,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = 
[13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5137,7 +5243,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -5146,7 +5252,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5154,7 +5260,8 @@ ; BITALG-LABEL: ult_13_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -5266,7 +5373,7 @@ ; ; AVX2-LABEL: ugt_13_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5277,7 +5384,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_13_v8i16: @@ -5285,7 +5393,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -5294,7 +5403,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -5302,14 +5412,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; 
BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_13_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -5426,7 +5538,7 @@ ; ; AVX2-LABEL: ult_14_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5437,7 +5549,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5446,7 +5558,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5456,7 +5568,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -5465,7 +5577,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5473,7 +5585,8 @@ ; BITALG-LABEL: ult_14_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -5585,7 +5698,7 @@ ; ; AVX2-LABEL: ugt_14_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand 
%xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5596,7 +5709,8 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_14_v8i16: @@ -5604,7 +5718,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -5613,7 +5728,8 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -5621,14 +5737,16 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; BITALG_NOVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_14_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; BITALG-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -5745,7 +5863,7 @@ ; ; AVX2-LABEL: ult_15_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5756,7 +5874,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5765,7 +5883,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = 
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -5775,7 +5893,7 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -5784,7 +5902,7 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -5792,7 +5910,8 @@
; BITALG-LABEL: ult_15_v8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %xmm0
; BITALG-NEXT: retq
%2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -6063,7 +6182,7 @@
;
; AVX2-LABEL: ugt_2_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6254,7 +6373,7 @@
;
; AVX2-LABEL: ult_3_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6441,7 +6560,7 @@
;
; AVX2-LABEL: ugt_3_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6632,7 +6751,7 @@
;
; AVX2-LABEL: ult_4_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -6819,7 +6938,7 @@
;
; AVX2-LABEL: ugt_4_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7010,7 +7129,7 @@
;
; AVX2-LABEL: ult_5_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7197,7 +7316,7 @@
;
; AVX2-LABEL: ugt_5_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7388,7 +7507,7 @@
;
; AVX2-LABEL: ult_6_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7575,7 +7694,7 @@
;
; AVX2-LABEL: ugt_6_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7766,7 +7885,7 @@
;
; AVX2-LABEL: ult_7_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -7953,7 +8072,7 @@
;
; AVX2-LABEL: ugt_7_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8144,7 +8263,7 @@
;
; AVX2-LABEL: ult_8_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8331,7 +8450,7 @@
;
; AVX2-LABEL: ugt_8_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8522,7 +8641,7 @@
;
; AVX2-LABEL: ult_9_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8709,7 +8828,7 @@
;
; AVX2-LABEL: ugt_9_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -8900,7 +9019,7 @@
;
; AVX2-LABEL: ult_10_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9087,7 +9206,7 @@
;
; AVX2-LABEL: ugt_10_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9278,7 +9397,7 @@
;
; AVX2-LABEL: ult_11_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9465,7 +9584,7 @@
;
; AVX2-LABEL: ugt_11_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9656,7 +9775,7 @@
;
; AVX2-LABEL: ult_12_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -9843,7 +9962,7 @@
;
; AVX2-LABEL: ugt_12_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10034,7 +10153,7 @@
;
; AVX2-LABEL: ult_13_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10221,7 +10340,7 @@
;
; AVX2-LABEL: ugt_13_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10412,7 +10531,7 @@
;
; AVX2-LABEL: ult_14_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10599,7 +10718,7 @@
;
; AVX2-LABEL: ugt_14_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10790,7 +10909,7 @@
;
; AVX2-LABEL: ult_15_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -10977,7 +11096,7 @@
;
; AVX2-LABEL: ugt_15_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11168,7 +11287,7 @@
;
; AVX2-LABEL: ult_16_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11355,7 +11474,7 @@
;
; AVX2-LABEL: ugt_16_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11546,7 +11665,7 @@
;
; AVX2-LABEL: ult_17_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11733,7 +11852,7 @@
;
; AVX2-LABEL: ugt_17_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -11924,7 +12043,7 @@
;
; AVX2-LABEL: ult_18_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12111,7 +12230,7 @@
;
; AVX2-LABEL: ugt_18_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12302,7 +12421,7 @@
;
; AVX2-LABEL: ult_19_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12489,7 +12608,7 @@
;
; AVX2-LABEL: ugt_19_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12680,7 +12799,7 @@
;
; AVX2-LABEL: ult_20_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -12867,7 +12986,7 @@
;
; AVX2-LABEL: ugt_20_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13058,7 +13177,7 @@
;
; AVX2-LABEL: ult_21_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13245,7 +13364,7 @@
;
; AVX2-LABEL: ugt_21_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13436,7 +13555,7 @@
;
; AVX2-LABEL: ult_22_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13623,7 +13742,7 @@
;
; AVX2-LABEL: ugt_22_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -13814,7 +13933,7 @@
;
; AVX2-LABEL: ult_23_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14001,7 +14120,7 @@
;
; AVX2-LABEL: ugt_23_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14192,7 +14311,7 @@
;
; AVX2-LABEL: ult_24_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14379,7 +14498,7 @@
;
; AVX2-LABEL: ugt_24_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14570,7 +14689,7 @@
;
; AVX2-LABEL: ult_25_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14757,7 +14876,7 @@
;
; AVX2-LABEL: ugt_25_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -14948,7 +15067,7 @@
;
; AVX2-LABEL: ult_26_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15135,7 +15254,7 @@
;
; AVX2-LABEL: ugt_26_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15326,7 +15445,7 @@
;
; AVX2-LABEL: ult_27_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15513,7 +15632,7 @@
;
; AVX2-LABEL: ugt_27_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15704,7 +15823,7 @@
;
; AVX2-LABEL: ult_28_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -15891,7 +16010,7 @@
;
; AVX2-LABEL: ugt_28_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16082,7 +16201,7 @@
;
; AVX2-LABEL: ult_29_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16269,7 +16388,7 @@
;
; AVX2-LABEL: ugt_29_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16460,7 +16579,7 @@
;
; AVX2-LABEL: ult_30_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16647,7 +16766,7 @@
;
; AVX2-LABEL: ugt_30_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16838,7 +16957,7 @@
;
; AVX2-LABEL: ult_31_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -16982,14 +17101,15 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_1_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -17086,7 +17206,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -17094,7 +17214,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_2_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -17252,7 +17372,7 @@
;
; AVX2-LABEL: ugt_2_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17262,21 +17382,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_2_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_2_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -17287,7 +17409,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -17296,7 +17419,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -17437,7 +17560,7 @@
;
; AVX2-LABEL: ult_3_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17447,7 +17570,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -17455,7 +17578,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -17463,7 +17586,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_3_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -17474,7 +17597,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -17484,7 +17607,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -17624,7 +17747,7 @@
;
; AVX2-LABEL: ugt_3_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17634,21 +17757,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_3_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_3_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -17659,7 +17784,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -17668,7 +17794,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -17809,7 +17935,7 @@
;
; AVX2-LABEL: ult_4_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -17819,7 +17945,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -17827,7 +17953,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -17835,7 +17961,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_4_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -17846,7 +17972,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -17856,7 +17982,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -17996,7 +18122,7 @@
;
; AVX2-LABEL: ugt_4_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18006,21 +18132,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_4_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_4_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -18031,7 +18159,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -18040,7 +18169,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -18181,7 +18310,7 @@
;
; AVX2-LABEL: ult_5_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18191,7 +18320,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -18199,7 +18328,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -18207,7 +18336,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_5_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -18218,7 +18347,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -18228,7 +18357,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -18368,7 +18497,7 @@
;
; AVX2-LABEL: ugt_5_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18378,21 +18507,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_5_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_5_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -18403,7 +18534,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -18412,7 +18544,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -18553,7 +18685,7 @@
;
; AVX2-LABEL: ult_6_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18563,7 +18695,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -18571,7 +18703,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -18579,7 +18711,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_6_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -18590,7 +18722,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -18600,7 +18732,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -18740,7 +18872,7 @@
;
; AVX2-LABEL: ugt_6_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18750,21 +18882,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_6_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_6_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -18775,7 +18909,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -18784,7 +18919,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -18925,7 +19060,7 @@
;
; AVX2-LABEL: ult_7_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -18935,7 +19070,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -18943,7 +19078,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -18951,7 +19086,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_7_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -18962,7 +19097,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -18972,7 +19107,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -19112,7 +19247,7 @@
;
; AVX2-LABEL: ugt_7_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19122,21 +19257,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_7_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_7_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -19147,7 +19284,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -19156,7 +19294,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -19297,7 +19435,7 @@
;
; AVX2-LABEL: ult_8_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19307,7 +19445,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -19315,7 +19453,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -19323,7 +19461,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_8_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -19334,7 +19472,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -19344,7 +19482,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -19484,7 +19622,7 @@
;
; AVX2-LABEL: ugt_8_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19494,21 +19632,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_8_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_8_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -19519,7 +19659,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -19528,7 +19669,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -19669,7 +19810,7 @@
;
; AVX2-LABEL: ult_9_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19679,7 +19820,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -19687,7 +19828,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -19695,7 +19836,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_9_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -19706,7 +19847,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -19716,7 +19857,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -19856,7 +19997,7 @@
;
; AVX2-LABEL: ugt_9_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -19866,21 +20007,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_9_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_9_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -19891,7 +20034,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -19900,7 +20044,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -20041,7 +20185,7 @@ ; ; AVX2-LABEL: ult_10_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20051,7 +20195,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20059,7 +20203,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20067,7 +20211,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_10_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -20078,7 +20222,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; BITALG_NOVLX-NEXT: vpbroadcastq 
{{.*#+}} xmm1 = [10,10] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20088,7 +20232,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -20228,7 +20372,7 @@ ; ; AVX2-LABEL: ugt_10_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20238,21 +20382,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_10_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_10_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -20263,7 +20409,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -20272,7 +20419,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -20413,7 +20560,7 @@ ; ; AVX2-LABEL: ult_11_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20423,7 +20570,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; AVX2-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [11,11] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20431,7 +20578,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20439,7 +20586,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_11_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -20450,7 +20597,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20460,7 +20607,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -20600,7 +20747,7 @@ ; ; AVX2-LABEL: ugt_11_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20610,21 +20757,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_11_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_11_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -20635,7 +20784,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -20644,7 +20794,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -20785,7 +20935,7 @@ ; ; AVX2-LABEL: ult_12_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20795,7 +20945,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20803,7 +20953,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20811,7 +20961,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_12_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -20822,7 +20972,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20832,7 +20982,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -20972,7 +21122,7 @@ ; ; AVX2-LABEL: ugt_12_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20982,21 +21132,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_12_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_12_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -21007,7 +21159,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -21016,7 +21169,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -21157,7 +21310,7 @@ ; ; AVX2-LABEL: ult_13_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21167,7 +21320,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21175,7 +21328,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21183,7 +21336,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_13_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -21194,7 +21347,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, 
%xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21204,7 +21357,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -21344,7 +21497,7 @@ ; ; AVX2-LABEL: ugt_13_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21354,21 +21507,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_13_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_13_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -21379,7 +21534,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -21388,7 +21544,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -21529,7 +21685,7 @@ ; ; AVX2-LABEL: ult_14_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21539,7 +21695,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21547,7 +21703,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21555,7 +21711,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_14_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -21566,7 +21722,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21576,7 +21732,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -21716,7 +21872,7 @@ ; ; AVX2-LABEL: ugt_14_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21726,21 +21882,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_14_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_14_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -21751,7 +21909,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -21760,7 +21919,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -21901,7 +22060,7 @@ ; ; AVX2-LABEL: ult_15_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21911,7 +22070,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21919,7 +22078,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21927,7 +22086,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_15_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -21938,7 +22097,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21948,7 +22107,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -22088,7 +22247,7 @@ ; ; AVX2-LABEL: ugt_15_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22098,21 +22257,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_15_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_15_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -22123,7 +22284,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -22132,7 +22294,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -22273,7 +22435,7 @@ ; ; AVX2-LABEL: ult_16_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22283,7 +22445,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22291,7 +22453,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22299,7 +22461,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_16_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -22310,7 +22472,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22320,7 +22482,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -22460,7 +22622,7 @@ ; ; AVX2-LABEL: ugt_16_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22470,21 +22632,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_16_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_16_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -22495,7 +22659,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -22504,7 +22669,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -22645,7 +22810,7 @@ ; ; AVX2-LABEL: ult_17_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22655,7 +22820,7 @@ ; AVX2-NEXT: 
vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22663,7 +22828,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22671,7 +22836,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_17_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -22682,7 +22847,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22692,7 +22857,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -22832,7 +22997,7 @@ ; ; AVX2-LABEL: ugt_17_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22842,21 +23007,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_17_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_17_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -22867,7 +23034,8 @@ ; BITALG_NOVLX-NEXT: 
vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -22876,7 +23044,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -23017,7 +23185,7 @@ ; ; AVX2-LABEL: ult_18_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23027,7 +23195,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23035,7 +23203,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23043,7 +23211,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_18_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -23054,7 +23222,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23064,7 +23232,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -23204,7 +23372,7 @@ ; ; AVX2-LABEL: ugt_18_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23214,21 +23382,23 @@ ; AVX2-NEXT: 
vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_18_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_18_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -23239,7 +23409,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -23248,7 +23419,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -23389,7 +23560,7 @@ ; ; AVX2-LABEL: ult_19_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23399,7 +23570,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23407,7 +23578,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23415,7 +23586,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_19_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -23426,7 +23597,7 @@ 
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23436,7 +23607,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -23576,7 +23747,7 @@ ; ; AVX2-LABEL: ugt_19_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23586,21 +23757,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_19_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_19_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -23611,7 +23784,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -23620,7 +23794,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -23761,7 +23935,7 @@ ; ; AVX2-LABEL: ult_20_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: 
vpshufb %xmm2, %xmm3, %xmm2 @@ -23771,7 +23945,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23779,7 +23953,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23787,7 +23961,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_20_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -23798,7 +23972,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23808,7 +23982,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -23948,7 +24122,7 @@ ; ; AVX2-LABEL: ugt_20_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23958,21 +24132,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_20_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_20_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; 
AVX512VPOPCNTDQVL-NEXT: retq @@ -23983,7 +24159,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -23992,7 +24169,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -24133,7 +24310,7 @@ ; ; AVX2-LABEL: ult_21_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24143,7 +24320,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -24151,7 +24328,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -24159,7 +24336,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_21_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -24170,7 +24347,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24180,7 +24357,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -24320,7 +24497,7 @@ ; ; AVX2-LABEL: ugt_21_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; 
AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24330,21 +24507,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_21_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_21_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -24355,7 +24534,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -24364,7 +24544,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -24505,7 +24685,7 @@
;
; AVX2-LABEL: ult_22_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24515,7 +24695,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -24523,7 +24703,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -24531,7 +24711,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_22_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -24542,7 +24722,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -24552,7 +24732,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -24692,7 +24872,7 @@
;
; AVX2-LABEL: ugt_22_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24702,21 +24882,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_22_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_22_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -24727,7 +24909,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -24736,7 +24919,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -24877,7 +25060,7 @@
;
; AVX2-LABEL: ult_23_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24887,7 +25070,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -24895,7 +25078,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -24903,7 +25086,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_23_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -24914,7 +25097,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -24924,7 +25107,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -25064,7 +25247,7 @@
;
; AVX2-LABEL: ugt_23_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25074,21 +25257,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_23_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_23_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -25099,7 +25284,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -25108,7 +25294,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -25249,7 +25435,7 @@
;
; AVX2-LABEL: ult_24_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25259,7 +25445,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -25267,7 +25453,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -25275,7 +25461,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_24_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -25286,7 +25472,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -25296,7 +25482,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -25436,7 +25622,7 @@
;
; AVX2-LABEL: ugt_24_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25446,21 +25632,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_24_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_24_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -25471,7 +25659,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -25480,7 +25669,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -25621,7 +25810,7 @@
;
; AVX2-LABEL: ult_25_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25631,7 +25820,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -25639,7 +25828,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -25647,7 +25836,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_25_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -25658,7 +25847,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -25668,7 +25857,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -25808,7 +25997,7 @@
;
; AVX2-LABEL: ugt_25_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -25818,21 +26007,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_25_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_25_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -25843,7 +26034,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -25852,7 +26044,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -25993,7 +26185,7 @@
;
; AVX2-LABEL: ult_26_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26003,7 +26195,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -26011,7 +26203,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -26019,7 +26211,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_26_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -26030,7 +26222,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -26040,7 +26232,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -26180,7 +26372,7 @@
;
; AVX2-LABEL: ugt_26_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26190,21 +26382,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_26_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_26_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -26215,7 +26409,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -26224,7 +26419,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -26365,7 +26560,7 @@
;
; AVX2-LABEL: ult_27_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26375,7 +26570,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -26383,7 +26578,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -26391,7 +26586,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_27_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -26402,7 +26597,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -26412,7 +26607,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -26552,7 +26747,7 @@
;
; AVX2-LABEL: ugt_27_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26562,21 +26757,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_27_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_27_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -26587,7 +26784,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -26596,7 +26794,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -26737,7 +26935,7 @@
;
; AVX2-LABEL: ult_28_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26747,7 +26945,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -26755,7 +26953,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -26763,7 +26961,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_28_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -26774,7 +26972,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -26784,7 +26982,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -26924,7 +27122,7 @@
;
; AVX2-LABEL: ugt_28_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -26934,21 +27132,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_28_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_28_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -26959,7 +27159,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -26968,7 +27169,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -27109,7 +27310,7 @@
;
; AVX2-LABEL: ult_29_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27119,7 +27320,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -27127,7 +27328,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -27135,7 +27336,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_29_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -27146,7 +27347,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -27156,7 +27357,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -27296,7 +27497,7 @@
;
; AVX2-LABEL: ugt_29_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27306,21 +27507,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_29_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_29_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -27331,7 +27534,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -27340,7 +27544,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -27481,7 +27685,7 @@
;
; AVX2-LABEL: ult_30_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27491,7 +27695,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -27499,7 +27703,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -27507,7 +27711,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_30_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -27518,7 +27722,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -27528,7 +27732,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -27668,7 +27872,7 @@
;
; AVX2-LABEL: ugt_30_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27678,21 +27882,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_30_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_30_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -27703,7 +27909,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -27712,7 +27919,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -27853,7 +28060,7 @@
;
; AVX2-LABEL: ult_31_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -27863,7 +28070,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -27871,7 +28078,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -27879,7 +28086,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_31_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -27890,7 +28097,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -27900,7 +28107,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -28040,7 +28247,7 @@
;
; AVX2-LABEL: ugt_31_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28050,21 +28257,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_31_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_31_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -28075,7 +28284,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -28084,7 +28294,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -28225,7 +28435,7 @@
;
; AVX2-LABEL: ult_32_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28235,7 +28445,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -28243,7 +28453,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -28251,7 +28461,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_32_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -28262,7 +28472,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -28272,7 +28482,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -28412,7 +28622,7 @@
;
; AVX2-LABEL: ugt_32_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28422,21 +28632,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_32_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_32_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -28447,7 +28659,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -28456,7 +28669,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -28597,7 +28810,7 @@
;
; AVX2-LABEL: ult_33_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28607,7 +28820,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -28615,7 +28828,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -28623,7 +28836,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_33_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -28634,7 +28847,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -28644,7 +28857,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -28784,7 +28997,7 @@
;
; AVX2-LABEL: ugt_33_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28794,21 +29007,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_33_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_33_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -28819,7 +29034,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -28828,7 +29044,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -28969,7 +29185,7 @@
;
; AVX2-LABEL: ult_34_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -28979,7 +29195,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -28987,7 +29203,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -28995,7 +29211,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_34_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -29006,7 +29222,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -29016,7 +29232,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -29156,7 +29372,7 @@
;
; AVX2-LABEL: ugt_34_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29166,21 +29382,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_34_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_34_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -29191,7 +29409,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -29200,7 +29419,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -29341,7 +29560,7 @@
;
; AVX2-LABEL: ult_35_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29351,7 +29570,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -29359,7 +29578,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -29367,7 +29586,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_35_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -29378,7 +29597,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -29388,7 +29607,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -29528,7 +29747,7 @@
;
; AVX2-LABEL: ugt_35_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29538,21 +29757,23 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_35_v2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_35_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -29563,7 +29784,8 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
@@ -29572,7 +29794,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; BITALG-NEXT: retq
@@ -29713,7 +29935,7 @@
;
; AVX2-LABEL: ult_36_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -29723,7 +29945,7 @@
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
@@ -29731,7 +29953,7 @@
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
@@ -29739,7 +29961,7 @@
; AVX512VPOPCNTDQVL-LABEL: ult_36_v2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VPOPCNTDQVL-NEXT: retq
@@ -29750,7 +29972,7 @@
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36]
; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
@@ -29760,7 +29982,7 @@
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; BITALG-NEXT: vmovdqa64 %xmm0,
%xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -29900,7 +30122,7 @@ ; ; AVX2-LABEL: ugt_36_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29910,21 +30132,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_36_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_36_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -29935,7 +30159,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -29944,7 +30169,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -30085,7 +30310,7 @@ ; ; AVX2-LABEL: ult_37_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30095,7 +30320,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30103,7 +30328,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq 
@@ -30111,7 +30336,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_37_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -30122,7 +30347,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30132,7 +30357,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -30272,7 +30497,7 @@ ; ; AVX2-LABEL: ugt_37_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30282,21 +30507,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_37_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_37_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -30307,7 +30534,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -30316,7 +30544,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; 
BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -30457,7 +30685,7 @@ ; ; AVX2-LABEL: ult_38_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30467,7 +30695,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30475,7 +30703,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30483,7 +30711,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_38_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -30494,7 +30722,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30504,7 +30732,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -30644,7 +30872,7 @@ ; ; AVX2-LABEL: ugt_38_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30654,21 +30882,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_38_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, 
%xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_38_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -30679,7 +30909,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -30688,7 +30919,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -30829,7 +31060,7 @@ ; ; AVX2-LABEL: ult_39_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30839,7 +31070,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30847,7 +31078,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30855,7 +31086,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_39_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -30866,7 +31097,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30876,7 +31107,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; 
BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -31016,7 +31247,7 @@ ; ; AVX2-LABEL: ugt_39_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31026,21 +31257,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_39_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_39_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -31051,7 +31284,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -31060,7 +31294,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -31201,7 +31435,7 @@ ; ; AVX2-LABEL: ult_40_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31211,7 +31445,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31219,7 +31453,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, 
%xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31227,7 +31461,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_40_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -31238,7 +31472,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31248,7 +31482,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -31388,7 +31622,7 @@ ; ; AVX2-LABEL: ugt_40_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31398,21 +31632,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_40_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_40_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -31423,7 +31659,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -31432,7 +31669,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; 
BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -31573,7 +31810,7 @@ ; ; AVX2-LABEL: ult_41_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31583,7 +31820,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31591,7 +31828,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31599,7 +31836,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_41_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -31610,7 +31847,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31620,7 +31857,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -31760,7 +31997,7 @@ ; ; AVX2-LABEL: ugt_41_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31770,21 +32007,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_41_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq 
{{.*#+}} xmm1 = [41,41] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_41_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -31795,7 +32034,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -31804,7 +32044,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -31945,7 +32185,7 @@ ; ; AVX2-LABEL: ult_42_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31955,7 +32195,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31963,7 +32203,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31971,7 +32211,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_42_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -31982,7 +32222,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31992,7 +32232,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; 
BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -32132,7 +32372,7 @@ ; ; AVX2-LABEL: ugt_42_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32142,21 +32382,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_42_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_42_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -32167,7 +32409,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -32176,7 +32419,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -32317,7 +32560,7 @@ ; ; AVX2-LABEL: ult_43_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32327,7 +32570,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32335,7 +32578,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; AVX512VPOPCNTDQ-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [43,43] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32343,7 +32586,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_43_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -32354,7 +32597,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32364,7 +32607,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -32504,7 +32747,7 @@ ; ; AVX2-LABEL: ugt_43_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32514,21 +32757,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_43_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_43_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -32539,7 +32784,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -32548,7 +32794,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; 
BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -32689,7 +32935,7 @@ ; ; AVX2-LABEL: ult_44_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32699,7 +32945,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32707,7 +32953,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32715,7 +32961,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_44_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -32726,7 +32972,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32736,7 +32982,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -32876,7 +33122,7 @@ ; ; AVX2-LABEL: ugt_44_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32886,21 +33132,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_44_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_44_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -32911,7 +33159,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -32920,7 +33169,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -33061,7 +33310,7 @@ ; ; AVX2-LABEL: ult_45_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33071,7 +33320,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33079,7 +33328,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33087,7 +33336,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_45_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -33098,7 +33347,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33108,7 +33357,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, 
%xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -33248,7 +33497,7 @@ ; ; AVX2-LABEL: ugt_45_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33258,21 +33507,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_45_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_45_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -33283,7 +33534,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -33292,7 +33544,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -33433,7 +33685,7 @@ ; ; AVX2-LABEL: ult_46_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33443,7 +33695,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33451,7 +33703,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; 
AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33459,7 +33711,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_46_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -33470,7 +33722,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33480,7 +33732,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -33620,7 +33872,7 @@ ; ; AVX2-LABEL: ugt_46_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33630,21 +33882,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_46_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_46_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -33655,7 +33909,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -33664,7 +33919,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; 
BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -33805,7 +34060,7 @@ ; ; AVX2-LABEL: ult_47_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33815,7 +34070,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33823,7 +34078,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33831,7 +34086,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_47_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -33842,7 +34097,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33852,7 +34107,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -33992,7 +34247,7 @@ ; ; AVX2-LABEL: ugt_47_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34002,21 +34257,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_47_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_47_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -34027,7 +34284,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -34036,7 +34294,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -34177,7 +34435,7 @@ ; ; AVX2-LABEL: ult_48_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34187,7 +34445,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34195,7 +34453,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34203,7 +34461,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_48_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -34214,7 +34472,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34224,7 +34482,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; 
BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -34364,7 +34622,7 @@ ; ; AVX2-LABEL: ugt_48_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34374,21 +34632,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_48_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_48_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -34399,7 +34659,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -34408,7 +34669,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -34549,7 +34810,7 @@ ; ; AVX2-LABEL: ult_49_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34559,7 +34820,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34567,7 +34828,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 
killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34575,7 +34836,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_49_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -34586,7 +34847,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34596,7 +34857,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -34736,7 +34997,7 @@ ; ; AVX2-LABEL: ugt_49_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34746,21 +35007,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_49_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_49_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -34771,7 +35034,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; 
@@ -34780,7 +35044,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -34921,7 +35185,7 @@ ; ; AVX2-LABEL: ult_50_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34931,7 +35195,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34939,7 +35203,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34947,7 +35211,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_50_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -34958,7 +35222,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34968,7 +35232,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -35108,7 +35372,7 @@ ; ; AVX2-LABEL: ugt_50_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35118,21 +35382,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_50_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; 
AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_50_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -35143,7 +35409,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -35152,7 +35419,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -35293,7 +35560,7 @@ ; ; AVX2-LABEL: ult_51_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35303,7 +35570,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35311,7 +35578,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35319,7 +35586,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_51_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -35330,7 +35597,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: 
retq @@ -35340,7 +35607,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -35480,7 +35747,7 @@ ; ; AVX2-LABEL: ugt_51_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35490,21 +35757,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_51_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_51_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -35515,7 +35784,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -35524,7 +35794,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -35665,7 +35935,7 @@ ; ; AVX2-LABEL: ult_52_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35675,7 +35945,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35683,7 +35953,7 @@ ; 
AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35691,7 +35961,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_52_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -35702,7 +35972,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35712,7 +35982,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -35852,7 +36122,7 @@ ; ; AVX2-LABEL: ugt_52_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35862,21 +36132,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_52_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_52_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -35887,7 +36159,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, 
%xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -35896,7 +36169,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -36037,7 +36310,7 @@ ; ; AVX2-LABEL: ult_53_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36047,7 +36320,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -36055,7 +36328,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -36063,7 +36336,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_53_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -36074,7 +36347,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -36084,7 +36357,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -36224,7 +36497,7 @@ ; ; AVX2-LABEL: ugt_53_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36234,21 +36507,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; 
AVX512VPOPCNTDQ-LABEL: ugt_53_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_53_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -36259,7 +36534,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -36268,7 +36544,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -36409,7 +36685,7 @@ ; ; AVX2-LABEL: ult_54_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36419,7 +36695,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -36427,7 +36703,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -36435,7 +36711,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_54_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -36446,7 +36722,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, 
%xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -36456,7 +36732,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -36596,7 +36872,7 @@ ; ; AVX2-LABEL: ugt_54_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36606,21 +36882,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_54_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_54_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -36631,7 +36909,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -36640,7 +36919,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -36781,7 +37060,7 @@ ; ; AVX2-LABEL: ult_55_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36791,7 +37070,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] ; AVX2-NEXT: vpcmpgtq 
%xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -36799,7 +37078,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -36807,7 +37086,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_55_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -36818,7 +37097,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -36828,7 +37107,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -36968,7 +37247,7 @@ ; ; AVX2-LABEL: ugt_55_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36978,21 +37257,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_55_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_55_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -37003,7 +37284,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq 
{{.*#+}} xmm1 = [55,55] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -37012,7 +37294,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -37153,7 +37435,7 @@ ; ; AVX2-LABEL: ult_56_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -37163,7 +37445,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -37171,7 +37453,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -37179,7 +37461,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_56_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -37190,7 +37472,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -37200,7 +37482,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -37340,7 +37622,7 @@ ; ; AVX2-LABEL: ugt_56_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -37350,21 +37632,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] +; 
AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_56_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_56_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -37375,7 +37659,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -37384,7 +37669,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -37525,7 +37810,7 @@ ; ; AVX2-LABEL: ult_57_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -37535,7 +37820,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -37543,7 +37828,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -37551,7 +37836,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_57_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -37562,7 +37847,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; BITALG_NOVLX-NEXT: vpbroadcastq 
{{.*#+}} xmm1 = [57,57] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -37572,7 +37857,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -37712,7 +37997,7 @@ ; ; AVX2-LABEL: ugt_57_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -37722,21 +38007,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_57_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_57_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -37747,7 +38034,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] +; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; @@ -37756,7 +38044,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -37897,7 +38185,7 @@ ; ; AVX2-LABEL: ult_58_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -37907,7 +38195,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; AVX2-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [58,58] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -37915,7 +38203,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -37923,7 +38211,7 @@ ; AVX512VPOPCNTDQVL-LABEL: ult_58_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -37934,7 +38222,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -37944,7 +38232,7 @@ ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; BITALG-NEXT: retq @@ -38084,7 +38372,7 @@ ; ; AVX2-LABEL: ugt_58_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -38094,21 +38382,23 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_58_v2i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_58_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; AVX512VPOPCNTDQVL-NEXT: retq @@ -38119,7 +38409,8 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpgtq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
 ;
@@ -38128,7 +38419,7 @@
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; BITALG-NEXT: retq
@@ -38269,7 +38560,7 @@
 ;
 ; AVX2-LABEL: ult_59_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -38279,7 +38570,7 @@
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -38287,7 +38578,7 @@
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -38295,7 +38586,7 @@
 ; AVX512VPOPCNTDQVL-LABEL: ult_59_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VPOPCNTDQVL-NEXT: retq
@@ -38306,7 +38597,7 @@
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -38316,7 +38607,7 @@
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; BITALG-NEXT: retq
@@ -38456,7 +38747,7 @@
 ;
 ; AVX2-LABEL: ugt_59_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -38466,21 +38757,23 @@
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_59_v2i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_59_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VPOPCNTDQVL-NEXT: retq
@@ -38491,7 +38784,8 @@
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
 ;
@@ -38500,7 +38794,7 @@
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; BITALG-NEXT: retq
@@ -38641,7 +38935,7 @@
 ;
 ; AVX2-LABEL: ult_60_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -38651,7 +38945,7 @@
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -38659,7 +38953,7 @@
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -38667,7 +38961,7 @@
 ; AVX512VPOPCNTDQVL-LABEL: ult_60_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VPOPCNTDQVL-NEXT: retq
@@ -38678,7 +38972,7 @@
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -38688,7 +38982,7 @@
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; BITALG-NEXT: retq
@@ -38828,7 +39122,7 @@
 ;
 ; AVX2-LABEL: ugt_60_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -38838,21 +39132,23 @@
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_60_v2i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_60_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VPOPCNTDQVL-NEXT: retq
@@ -38863,7 +39159,8 @@
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
 ;
@@ -38872,7 +39169,7 @@
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; BITALG-NEXT: retq
@@ -39013,7 +39310,7 @@
 ;
 ; AVX2-LABEL: ult_61_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -39023,7 +39320,7 @@
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -39031,7 +39328,7 @@
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -39039,7 +39336,7 @@
 ; AVX512VPOPCNTDQVL-LABEL: ult_61_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VPOPCNTDQVL-NEXT: retq
@@ -39050,7 +39347,7 @@
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -39060,7 +39357,7 @@
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; BITALG-NEXT: retq
@@ -39200,7 +39497,7 @@
 ;
 ; AVX2-LABEL: ugt_61_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -39210,21 +39507,23 @@
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_61_v2i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_61_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VPOPCNTDQVL-NEXT: retq
@@ -39235,7 +39534,8 @@
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
 ;
@@ -39244,7 +39544,7 @@
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; BITALG-NEXT: retq
@@ -39385,7 +39685,7 @@
 ;
 ; AVX2-LABEL: ult_62_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -39395,7 +39695,7 @@
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -39403,7 +39703,7 @@
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -39411,7 +39711,7 @@
 ; AVX512VPOPCNTDQVL-LABEL: ult_62_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VPOPCNTDQVL-NEXT: retq
@@ -39422,7 +39722,7 @@
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -39432,7 +39732,7 @@
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; BITALG-NEXT: retq
@@ -39572,7 +39872,7 @@
 ;
 ; AVX2-LABEL: ugt_62_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -39582,21 +39882,23 @@
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62]
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_62_v2i64:
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_62_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VPOPCNTDQVL-NEXT: retq
@@ -39607,7 +39909,8 @@
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62]
+; BITALG_NOVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
 ;
@@ -39616,7 +39919,7 @@
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; BITALG-NEXT: retq
@@ -39757,7 +40060,7 @@
 ;
 ; AVX2-LABEL: ult_63_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -39767,7 +40070,7 @@
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -39775,7 +40078,7 @@
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -39783,7 +40086,7 @@
 ; AVX512VPOPCNTDQVL-LABEL: ult_63_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VPOPCNTDQVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VPOPCNTDQVL-NEXT: retq
@@ -39794,7 +40097,7 @@
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -39804,7 +40107,7 @@
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; BITALG-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; BITALG-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; BITALG-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
--- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
@@ -84,20 +84,6 @@
 ; SSE41-NEXT: psadbw %xmm3, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX1OR2-LABEL: testv2i64:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
-;
 ; XOP-LABEL: testv2i64:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -235,24 +221,6 @@
 ; SSE41-NEXT: packuswb %xmm3, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX1OR2-LABEL: testv4i32:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
-; AVX1OR2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
-;
 ; XOP-LABEL: testv4i32:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -390,21 +358,6 @@
 ; SSE41-NEXT: psrlw $8, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX1OR2-LABEL: testv8i16:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX1OR2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
-;
 ; XOP-LABEL: testv8i16:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -518,18 +471,6 @@
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX1OR2-LABEL: testv16i8:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: retq
-;
 ; XOP-LABEL: testv16i8:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
@@ -746,14 +687,16 @@
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; AVX512VPOPCNTDQ-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: eq_1_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
 ; BITALG_NOVLX-LABEL: eq_1_v2i64:
@@ -879,7 +822,8 @@
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; AVX512VPOPCNTDQ-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
@@ -888,7 +832,8 @@
 ; AVX512VPOPCNTDQVL-LABEL: ne_1_v2i64:
 ; AVX512VPOPCNTDQVL: # %bb.0:
 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1]
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -1133,14 +1078,16 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: eq_1_v8i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
 %3 = icmp eq <8 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -1214,7 +1161,8 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
@@ -1223,7 +1171,8 @@
 ; BITALG-LABEL: ne_1_v8i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %xmm0, %xmm0
-; BITALG-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: retq
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -1281,14 +1230,16 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: eq_1_v16i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
-; BITALG-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT: retq
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
 %3 = icmp eq <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -1362,7 +1313,8 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
@@ -1371,7 +1323,8 @@
 ; BITALG-LABEL: ne_1_v16i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0
-; BITALG-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; BITALG-NEXT: retq
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
--- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
@@ -59,13 +59,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_1_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -120,14 +122,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ult_2_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT: vpcmpltub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -164,7 +167,7 @@
 ;
 ; AVX2-LABEL: ugt_2_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -172,13 +175,14 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_2_v32i8:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -186,13 +190,14 @@
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_2_v32i8:
 ; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -200,7 +205,8 @@
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQVL-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -208,13 +214,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_2_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -251,7 +259,7 @@
 ;
 ; AVX2-LABEL: ult_3_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -259,13 +267,14 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_3_v32i8:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -273,13 +282,14 @@
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VPOPCNTDQ-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ult_3_v32i8:
 ; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -287,7 +297,8 @@
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VPOPCNTDQVL-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -295,14 +306,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
 ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ult_3_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG-NEXT: vpcmpltub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -339,7 +351,7 @@
 ;
 ; AVX2-LABEL: ugt_3_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -347,13 +359,14 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_3_v32i8:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -361,13 +374,14 @@
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_3_v32i8:
 ; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -375,7 +389,8 @@
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQVL-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -383,13 +398,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG_NOVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_3_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -426,7 +443,7 @@
 ;
 ; AVX2-LABEL: ult_4_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -434,13 +451,14 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_4_v32i8:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -448,13 +466,14 @@
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ult_4_v32i8:
 ; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -462,7 +481,8 @@
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQVL-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -470,14 +490,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ult_4_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG-NEXT: vpcmpltub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -514,7 +535,7 @@
 ;
 ; AVX2-LABEL: ugt_4_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -522,13 +543,14 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_4_v32i8:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -536,13 +558,14 @@
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_4_v32i8:
 ; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -550,7 +573,8 @@
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQVL-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -558,13 +582,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG_NOVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_4_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -601,7 +627,7 @@
 ;
 ; AVX2-LABEL: ult_5_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -609,13 +635,14 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_5_v32i8:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -623,13 +650,14 @@
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ult_5_v32i8:
 ; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -637,7 +665,8 @@
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQVL-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -645,14 +674,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ult_5_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG-NEXT: vpcmpltub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -689,7 +719,7 @@
 ;
 ; AVX2-LABEL: ugt_5_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -697,13 +727,14 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_5_v32i8:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -711,13 +742,14 @@
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_5_v32i8:
 ; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -725,7 +757,8 @@
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQVL-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -733,13 +766,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG_NOVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_5_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -776,7 +811,7 @@
 ;
 ; AVX2-LABEL: ult_6_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -784,13 +819,14 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_6_v32i8:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -798,13 +834,14 @@
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ult_6_v32i8:
 ; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -812,7 +849,8 @@
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQVL-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -820,14 +858,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
 ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ult_6_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG-NEXT: vpcmpltub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -864,7 +903,7 @@
 ;
 ; AVX2-LABEL: ugt_6_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -872,13 +911,14 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ugt_6_v32i8:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -886,13 +926,14 @@
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ugt_6_v32i8:
 ; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -900,7 +941,8 @@
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQVL-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -908,13 +950,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG_NOVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_6_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -951,7 +995,7 @@
 ;
 ; AVX2-LABEL: ult_7_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -959,13 +1003,14 @@
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: ult_7_v32i8:
 ; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -973,13 +1018,14 @@
 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: ult_7_v32i8:
 ; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -987,7 +1033,8 @@
 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQVL-NEXT: vpminub %ymm1, %ymm0, %ymm1
 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT: retq
 ;
@@ -995,14 +1042,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ult_7_v32i8:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG-NEXT: vpcmpltub %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2b %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
@@ -1063,13 +1111,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_1_v16i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1124,14 +1174,15 @@
 ; BITALG_NOVLX: # %bb.0:
 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
 ; BITALG_NOVLX-NEXT: retq
 ;
 ; BITALG-LABEL: ult_2_v16i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %ymm0
 ; BITALG-NEXT: retq
 %2 =
tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1172,7 +1223,7 @@ ; ; AVX2-LABEL: ugt_2_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1183,7 +1234,8 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v16i16: @@ -1191,7 +1243,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_2_v16i16: @@ -1199,20 +1252,23 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_2_v16i16: ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_2_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1253,7 +1309,7 @@ ; ; AVX2-LABEL: ult_3_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1264,7 +1320,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1273,7 +1329,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1282,7 +1338,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1290,14 +1346,15 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_3_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1338,7 +1395,7 @@ ; ; AVX2-LABEL: ugt_3_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1349,7 +1406,8 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v16i16: @@ -1357,7 +1415,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_3_v16i16: @@ -1365,20 +1424,23 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_3_v16i16: ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_3_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1419,7 +1481,7 @@ ; ; AVX2-LABEL: ult_4_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1430,7 +1492,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1439,7 +1501,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ 
-1448,7 +1510,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1456,14 +1518,15 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_4_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1504,7 +1567,7 @@ ; ; AVX2-LABEL: ugt_4_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1515,7 +1578,8 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v16i16: @@ -1523,7 +1587,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_4_v16i16: @@ -1531,20 +1596,23 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = 
[4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_4_v16i16: ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_4_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1585,7 +1653,7 @@ ; ; AVX2-LABEL: ult_5_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1596,7 +1664,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1605,7 +1673,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1614,7 +1682,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1622,14 +1690,15 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_5_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, 
%ymm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1670,7 +1739,7 @@ ; ; AVX2-LABEL: ugt_5_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1681,7 +1750,8 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v16i16: @@ -1689,7 +1759,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_5_v16i16: @@ -1697,20 +1768,23 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_5_v16i16: ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_5_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1751,7 +1825,7 @@ ; ; AVX2-LABEL: ult_6_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1762,7 +1836,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1771,7 +1845,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1780,7 +1854,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1788,14 +1862,15 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_6_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1836,7 +1911,7 @@ ; ; AVX2-LABEL: ugt_6_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1847,7 +1922,8 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; 
AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v16i16: @@ -1855,7 +1931,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_6_v16i16: @@ -1863,20 +1940,23 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_6_v16i16: ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_6_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1917,7 +1997,7 @@ ; ; AVX2-LABEL: ult_7_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1928,7 +2008,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1937,7 +2017,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; 
AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1946,7 +2026,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1954,14 +2034,15 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_7_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2002,7 +2083,7 @@ ; ; AVX2-LABEL: ugt_7_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2013,7 +2094,8 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_7_v16i16: @@ -2021,7 +2103,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_7_v16i16: @@ -2029,20 +2112,23 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: 
vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_7_v16i16: ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_7_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2083,7 +2169,7 @@ ; ; AVX2-LABEL: ult_8_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2094,7 +2180,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2103,7 +2189,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2112,7 +2198,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2120,14 +2206,15 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_8_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2168,7 +2255,7 @@ ; ; AVX2-LABEL: ugt_8_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2179,7 +2266,8 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_8_v16i16: @@ -2187,7 +2275,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_8_v16i16: @@ -2195,20 +2284,23 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_8_v16i16: ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_8_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2249,7 +2341,7 @@ ; ; AVX2-LABEL: ult_9_v16i16: ; 
AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2260,7 +2352,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2269,7 +2361,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2278,7 +2370,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2286,14 +2378,15 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_9_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2334,7 +2427,7 @@ ; ; AVX2-LABEL: ugt_9_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2345,7 +2438,8 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 
-; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_9_v16i16: @@ -2353,7 +2447,8 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ugt_9_v16i16: @@ -2361,20 +2456,23 @@ ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: ugt_9_v16i16: ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ugt_9_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2415,7 +2513,7 @@ ; ; AVX2-LABEL: ult_10_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2426,7 +2524,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2435,7 +2533,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -2444,7 +2542,7 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -2452,14 +2550,15 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_10_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2500,7 +2599,7 @@
;
; AVX2-LABEL: ugt_10_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2511,7 +2610,8 @@
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_10_v16i16:
@@ -2519,7 +2619,8 @@
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_10_v16i16:
@@ -2527,20 +2628,23 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: ugt_10_v16i16:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ugt_10_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2581,7 +2685,7 @@
;
; AVX2-LABEL: ult_11_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2592,7 +2696,7 @@
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2601,7 +2705,7 @@
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -2610,7 +2714,7 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -2618,14 +2722,15 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_11_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2666,7 +2771,7 @@
;
; AVX2-LABEL: ugt_11_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2677,7 +2782,8 @@
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_11_v16i16:
@@ -2685,7 +2791,8 @@
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_11_v16i16:
@@ -2693,20 +2800,23 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: ugt_11_v16i16:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ugt_11_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2747,7 +2857,7 @@
;
; AVX2-LABEL: ult_12_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2758,7 +2868,7 @@
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2767,7 +2877,7 @@
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -2776,7 +2886,7 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -2784,14 +2894,15 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_12_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2832,7 +2943,7 @@
;
; AVX2-LABEL: ugt_12_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2843,7 +2954,8 @@
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_12_v16i16:
@@ -2851,7 +2963,8 @@
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_12_v16i16:
@@ -2859,20 +2972,23 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: ugt_12_v16i16:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ugt_12_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2913,7 +3029,7 @@
;
; AVX2-LABEL: ult_13_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2924,7 +3040,7 @@
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2933,7 +3049,7 @@
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -2942,7 +3058,7 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -2950,14 +3066,15 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_13_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -2998,7 +3115,7 @@
;
; AVX2-LABEL: ugt_13_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3009,7 +3126,8 @@
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_13_v16i16:
@@ -3017,7 +3135,8 @@
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_13_v16i16:
@@ -3025,20 +3144,23 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: ugt_13_v16i16:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ugt_13_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -3079,7 +3201,7 @@
;
; AVX2-LABEL: ult_14_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3090,7 +3212,7 @@
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -3099,7 +3221,7 @@
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -3108,7 +3230,7 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -3116,14 +3238,15 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_14_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -3164,7 +3287,7 @@
;
; AVX2-LABEL: ugt_14_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3175,7 +3298,8 @@
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: ugt_14_v16i16:
@@ -3183,7 +3307,8 @@
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: ugt_14_v16i16:
@@ -3191,20 +3316,23 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: ugt_14_v16i16:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; BITALG_NOVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ugt_14_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; BITALG-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -3245,7 +3373,7 @@
;
; AVX2-LABEL: ult_15_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3256,7 +3384,7 @@
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -3265,7 +3393,7 @@
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
@@ -3274,7 +3402,7 @@
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
@@ -3282,14 +3410,15 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: ult_15_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -3467,7 +3596,7 @@
;
; AVX2-LABEL: ugt_2_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3571,7 +3700,7 @@
;
; AVX2-LABEL: ult_3_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3675,7 +3804,7 @@
;
; AVX2-LABEL: ugt_3_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3779,7 +3908,7 @@
;
; AVX2-LABEL: ult_4_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3883,7 +4012,7 @@
;
; AVX2-LABEL: ugt_4_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3987,7 +4116,7 @@
;
; AVX2-LABEL: ult_5_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -4091,7 +4220,7 @@
;
; AVX2-LABEL: ugt_5_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -4195,7 +4324,7 @@
;
; AVX2-LABEL: ult_6_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -4299,7 +4428,7 @@
;
; AVX2-LABEL: ugt_6_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -4403,7 +4532,7 @@
;
; AVX2-LABEL: ult_7_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -4507,7 +4636,7 @@
;
; AVX2-LABEL: ugt_7_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -4611,7 +4740,7 @@
;
; AVX2-LABEL: ult_8_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -4715,7 +4844,7 @@
;
; AVX2-LABEL: ugt_8_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -4819,7 +4948,7 @@
;
; AVX2-LABEL: ult_9_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -4923,7 +5052,7 @@
;
; AVX2-LABEL: ugt_9_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -5027,7 +5156,7 @@
;
; AVX2-LABEL: ult_10_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -5131,7 +5260,7 @@
;
; AVX2-LABEL: ugt_10_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -5235,7 +5364,7 @@
;
; AVX2-LABEL: ult_11_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -5339,7 +5468,7 @@
;
; AVX2-LABEL: ugt_11_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -5443,7 +5572,7 @@
;
; AVX2-LABEL: ult_12_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -5547,7 +5676,7 @@
;
; AVX2-LABEL: ugt_12_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -5651,7 +5780,7 @@
;
; AVX2-LABEL: ult_13_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -5755,7 +5884,7 @@
;
; AVX2-LABEL: ugt_13_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -5859,7 +5988,7 @@
;
; AVX2-LABEL: ult_14_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -5963,7 +6092,7 @@
;
; AVX2-LABEL: ugt_14_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -6067,7 +6196,7 @@
;
; AVX2-LABEL: ult_15_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -6171,7 +6300,7 @@
;
; AVX2-LABEL: ugt_15_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -6275,7 +6404,7 @@
;
; AVX2-LABEL: ult_16_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -6379,7 +6508,7 @@
;
; AVX2-LABEL: ugt_16_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -6483,7 +6612,7 @@
;
; AVX2-LABEL: ult_17_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -6587,7 +6716,7 @@
;
; AVX2-LABEL: ugt_17_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -6691,7 +6820,7 @@
;
; AVX2-LABEL: ult_18_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -6795,7 +6924,7 @@
;
; AVX2-LABEL: ugt_18_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -6899,7 +7028,7 @@
;
; AVX2-LABEL: ult_19_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -7003,7 +7132,7 @@
;
; AVX2-LABEL: ugt_19_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -7107,7 +7236,7 @@
;
; AVX2-LABEL: ult_20_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -7211,7 +7340,7 @@
;
; AVX2-LABEL: ugt_20_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -7315,7 +7444,7 @@
;
; AVX2-LABEL: ult_21_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -7419,7 +7548,7 @@
;
; AVX2-LABEL: ugt_21_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -7523,7 +7652,7 @@
;
; AVX2-LABEL: ult_22_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -7627,7 +7756,7 @@
;
; AVX2-LABEL: ugt_22_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -7731,7 +7860,7 @@
;
; AVX2-LABEL: ult_23_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -7835,7 +7964,7 @@
;
; AVX2-LABEL: ugt_23_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -7939,7 +8068,7 @@
;
; AVX2-LABEL: ult_24_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -8043,7 +8172,7 @@
;
; AVX2-LABEL: ugt_24_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -8147,7 +8276,7 @@
;
; AVX2-LABEL: ult_25_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -8251,7 +8380,7 @@
;
; AVX2-LABEL: ugt_25_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -8355,7 +8484,7 @@
;
; AVX2-LABEL: ult_26_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -8459,7 +8588,7 @@
;
; AVX2-LABEL: ugt_26_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -8563,7 +8692,7 @@
;
; AVX2-LABEL: ult_27_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -8667,7 +8796,7 @@
;
; AVX2-LABEL: ugt_27_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -8771,7 +8900,7 @@
;
; AVX2-LABEL: ult_28_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -8875,7 +9004,7 @@
;
; AVX2-LABEL: ugt_28_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -8979,7 +9108,7 @@
;
; AVX2-LABEL: ult_29_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -9083,7 +9212,7 @@
;
; AVX2-LABEL: ugt_29_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -9187,7 +9316,7 @@
;
; AVX2-LABEL: ult_30_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -9291,7 +9420,7 @@
;
; AVX2-LABEL: ugt_30_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -9395,7 +9524,7 @@
;
; AVX2-LABEL: ult_31_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -9623,7 +9752,7 @@
;
; AVX2-LABEL: ugt_2_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -9707,7 +9836,7 @@
;
; AVX2-LABEL: ult_3_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -9791,7 +9920,7 @@
;
; AVX2-LABEL: ugt_3_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -9875,7 +10004,7 @@
;
; AVX2-LABEL: ult_4_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -9959,7 +10088,7 @@
;
; AVX2-LABEL: ugt_4_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10043,7 +10172,7 @@
;
; AVX2-LABEL: ult_5_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10127,7 +10256,7 @@
;
; AVX2-LABEL: ugt_5_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10211,7 +10340,7 @@
;
; AVX2-LABEL: ult_6_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10295,7 +10424,7 @@
;
; AVX2-LABEL: ugt_6_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10379,7 +10508,7 @@
;
; AVX2-LABEL: ult_7_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10463,7 +10592,7 @@
;
; AVX2-LABEL: ugt_7_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10547,7 +10676,7 @@
;
; AVX2-LABEL: ult_8_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10631,7 +10760,7 @@
;
; AVX2-LABEL: ugt_8_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10715,7 +10844,7 @@
;
; AVX2-LABEL: ult_9_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10799,7 +10928,7 @@
;
; AVX2-LABEL: ugt_9_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10883,7 +11012,7 @@
;
; AVX2-LABEL: ult_10_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -10967,7 +11096,7 @@
;
; AVX2-LABEL: ugt_10_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -11051,7 +11180,7 @@
;
; AVX2-LABEL: ult_11_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -11135,7 +11264,7 @@
;
; AVX2-LABEL: ugt_11_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -11219,7 +11348,7 @@
;
; AVX2-LABEL: ult_12_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 =
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11303,7 +11432,7 @@ ; ; AVX2-LABEL: ugt_12_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11387,7 +11516,7 @@ ; ; AVX2-LABEL: ult_13_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11471,7 +11600,7 @@ ; ; AVX2-LABEL: ugt_13_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11555,7 +11684,7 @@ ; ; AVX2-LABEL: ult_14_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11639,7 +11768,7 @@ ; ; AVX2-LABEL: ugt_14_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11723,7 +11852,7 @@ ; ; AVX2-LABEL: ult_15_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11807,7 +11936,7 @@ ; ; AVX2-LABEL: ugt_15_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11891,7 +12020,7 @@ ; ; AVX2-LABEL: ult_16_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11975,7 +12104,7 @@ ; ; AVX2-LABEL: ugt_16_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12059,7 +12188,7 @@ ; ; AVX2-LABEL: ult_17_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12143,7 +12272,7 @@ ; ; AVX2-LABEL: ugt_17_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12227,7 +12356,7 @@ ; ; AVX2-LABEL: ult_18_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12311,7 +12440,7 @@ ; ; AVX2-LABEL: ugt_18_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12395,7 +12524,7 @@ ; ; AVX2-LABEL: ult_19_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12479,7 +12608,7 @@ ; ; AVX2-LABEL: ugt_19_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12563,7 +12692,7 @@ ; ; AVX2-LABEL: ult_20_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12647,7 +12776,7 @@ ; ; AVX2-LABEL: ugt_20_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12731,7 +12860,7 @@ ; ; AVX2-LABEL: ult_21_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12815,7 +12944,7 @@ ; ; AVX2-LABEL: ugt_21_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12899,7 +13028,7 @@ ; ; AVX2-LABEL: ult_22_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12983,7 +13112,7 @@ ; ; AVX2-LABEL: ugt_22_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13067,7 +13196,7 @@ ; ; AVX2-LABEL: ult_23_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13151,7 +13280,7 @@ ; ; AVX2-LABEL: ugt_23_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13235,7 +13364,7 @@ ; ; AVX2-LABEL: ult_24_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13319,7 +13448,7 @@ ; ; AVX2-LABEL: ugt_24_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13403,7 +13532,7 @@ ; ; AVX2-LABEL: ult_25_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13487,7 +13616,7 @@ ; ; AVX2-LABEL: ugt_25_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13571,7 +13700,7 @@ ; ; AVX2-LABEL: ult_26_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13655,7 +13784,7 @@ ; ; AVX2-LABEL: ugt_26_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13739,7 +13868,7 @@ ; ; AVX2-LABEL: ult_27_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13823,7 +13952,7 @@ ; ; AVX2-LABEL: ugt_27_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13907,7 +14036,7 @@ ; ; AVX2-LABEL: ult_28_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13991,7 +14120,7 @@ ; ; AVX2-LABEL: ugt_28_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14075,7 +14204,7 @@ ; ; AVX2-LABEL: ult_29_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14159,7 +14288,7 @@ ; ; AVX2-LABEL: ugt_29_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14243,7 +14372,7 @@ ; ; AVX2-LABEL: ult_30_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14327,7 +14456,7 @@ ; ; AVX2-LABEL: ugt_30_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14411,7 +14540,7 @@ ; ; AVX2-LABEL: ult_31_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14495,7 +14624,7 @@ ; ; AVX2-LABEL: ugt_31_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14579,7 +14708,7 @@ ; ; AVX2-LABEL: ult_32_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14663,7 +14792,7 @@ ; ; AVX2-LABEL: ugt_32_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14747,7 +14876,7 @@ ; ; AVX2-LABEL: ult_33_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14831,7 +14960,7 @@ ; ; AVX2-LABEL: ugt_33_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14915,7 +15044,7 @@ ; ; AVX2-LABEL: ult_34_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14999,7 +15128,7 @@ ; ; AVX2-LABEL: ugt_34_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15083,7 +15212,7 @@ ; ; AVX2-LABEL: ult_35_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15167,7 +15296,7 @@ ; ; AVX2-LABEL: ugt_35_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15251,7 +15380,7 @@ ; ; AVX2-LABEL: ult_36_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15335,7 +15464,7 @@ ; ; AVX2-LABEL: ugt_36_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15419,7 +15548,7 @@ ; ; AVX2-LABEL: ult_37_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15503,7 +15632,7 @@ ; ; AVX2-LABEL: ugt_37_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15587,7 +15716,7 @@ ; ; AVX2-LABEL: ult_38_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15671,7 +15800,7 @@ ; ; AVX2-LABEL: ugt_38_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15755,7 +15884,7 @@ ; ; AVX2-LABEL: ult_39_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15839,7 +15968,7 @@ ; ; AVX2-LABEL: ugt_39_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15923,7 +16052,7 @@ ; ; AVX2-LABEL: ult_40_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16007,7 +16136,7 @@ ; ; AVX2-LABEL: ugt_40_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16091,7 +16220,7 @@ ; ; AVX2-LABEL: ult_41_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16175,7 +16304,7 @@ ; ; AVX2-LABEL: ugt_41_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16259,7 +16388,7 @@ ; ; AVX2-LABEL: ult_42_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16343,7 +16472,7 @@ ; ; AVX2-LABEL: ugt_42_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16427,7 +16556,7 @@ ; ; AVX2-LABEL: ult_43_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16511,7 +16640,7 @@ ; ; AVX2-LABEL: ugt_43_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16595,7 +16724,7 @@ ; ; AVX2-LABEL: ult_44_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16679,7 +16808,7 @@ ; ; AVX2-LABEL: ugt_44_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16763,7 +16892,7 @@ ; ; AVX2-LABEL: ult_45_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16847,7 +16976,7 @@ ; ; AVX2-LABEL: ugt_45_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16931,7 +17060,7 @@ ; ; AVX2-LABEL: ult_46_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17015,7 +17144,7 @@ ; ; AVX2-LABEL: ugt_46_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17099,7 +17228,7 @@ ; ; AVX2-LABEL: ult_47_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17183,7 +17312,7 @@ ; ; AVX2-LABEL: ugt_47_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17267,7 +17396,7 @@ ; ; AVX2-LABEL: ult_48_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17351,7 +17480,7 @@ ; ; AVX2-LABEL: ugt_48_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17435,7 +17564,7 @@ ; ; AVX2-LABEL: ult_49_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17519,7 +17648,7 @@ ; ; AVX2-LABEL: ugt_49_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17603,7 +17732,7 @@ ; ; AVX2-LABEL: ult_50_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17687,7 +17816,7 @@ ; ; AVX2-LABEL: ugt_50_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17771,7 +17900,7 @@ ; ; AVX2-LABEL: ult_51_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17855,7 +17984,7 @@ ; ; AVX2-LABEL: ugt_51_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17939,7 +18068,7 @@ ; ; AVX2-LABEL: ult_52_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18023,7 +18152,7 @@ ; ; AVX2-LABEL: ugt_52_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18107,7 +18236,7 @@ ; ; AVX2-LABEL: ult_53_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18191,7 +18320,7 @@ ; ; AVX2-LABEL: ugt_53_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18275,7 +18404,7 @@ ; ; AVX2-LABEL: ult_54_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18359,7 +18488,7 @@ ; ; AVX2-LABEL: ugt_54_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18443,7 +18572,7 @@ ; ; AVX2-LABEL: ult_55_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18527,7 +18656,7 @@ ; ; AVX2-LABEL: ugt_55_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -18611,7 +18740,7 @@
;
; AVX2-LABEL: ult_56_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -18695,7 +18824,7 @@
;
; AVX2-LABEL: ugt_56_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -18779,7 +18908,7 @@
;
; AVX2-LABEL: ult_57_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -18863,7 +18992,7 @@
;
; AVX2-LABEL: ugt_57_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -18947,7 +19076,7 @@
;
; AVX2-LABEL: ult_58_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -19031,7 +19160,7 @@
;
; AVX2-LABEL: ugt_58_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -19115,7 +19244,7 @@
;
; AVX2-LABEL: ult_59_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -19199,7 +19328,7 @@
;
; AVX2-LABEL: ugt_59_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -19283,7 +19412,7 @@
;
; AVX2-LABEL: ult_60_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -19367,7 +19496,7 @@
;
; AVX2-LABEL: ugt_60_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -19451,7 +19580,7 @@
;
; AVX2-LABEL: ult_61_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -19535,7 +19664,7 @@
;
; AVX2-LABEL: ugt_61_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -19619,7 +19748,7 @@
;
; AVX2-LABEL: ult_62_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -19703,7 +19832,7 @@
;
; AVX2-LABEL: ugt_62_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -19787,7 +19916,7 @@
;
; AVX2-LABEL: ult_63_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll
--- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll
@@ -33,7 +33,7 @@
;
; AVX2-LABEL: testv4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -131,7 +131,7 @@
;
; AVX2-LABEL: testv8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -244,7 +244,7 @@
;
; AVX2-LABEL: testv16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -334,7 +334,7 @@
;
; AVX2-LABEL: testv32i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -365,7 +365,7 @@
;
; AVX512VPOPCNTDQ-LABEL: testv32i8:
; AVX512VPOPCNTDQ: # %bb.0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -377,7 +377,7 @@
;
; AVX512VPOPCNTDQVL-LABEL: testv32i8:
; AVX512VPOPCNTDQVL: # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -886,13 +886,15 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: eq_1_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
%3 = icmp eq <16 x i16> %2,
@@ -978,7 +980,8 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; BITALG_NOVLX-NEXT: retq
@@ -986,7 +989,8 @@
; BITALG-LABEL: ne_1_v16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %ymm0, %ymm0
-; BITALG-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0)
@@ -1069,13 +1073,15 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: eq_1_v32i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
%3 = icmp eq <32 x i8> %2,
@@ -1161,7 +1167,8 @@
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG_NOVLX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; BITALG_NOVLX-NEXT: retq
@@ -1169,7 +1176,8 @@
; BITALG-LABEL: ne_1_v32i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %ymm0, %ymm0
-; BITALG-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; BITALG-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0
; BITALG-NEXT: retq
%2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0)
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
--- a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll
@@ -56,7 +56,8 @@
; BITALG-LABEL: ugt_1_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -113,7 +114,8 @@
; BITALG-LABEL: ult_2_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -125,7 +127,7 @@
define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_2_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -140,7 +142,7 @@
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -150,7 +152,7 @@
;
; AVX512BW-LABEL: ugt_2_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -158,13 +160,14 @@
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_2_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -179,7 +182,7 @@
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmaxub %ymm1, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -189,7 +192,7 @@
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_2_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -197,14 +200,16 @@
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
;
; BITALG-LABEL: ugt_2_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -216,7 +221,7 @@
define <64 x i8> @ult_3_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_3_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -231,7 +236,7 @@
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpminub %ymm1, %ymm2, %ymm1
@@ -241,7 +246,7 @@
;
; AVX512BW-LABEL: ult_3_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -249,13 +254,14 @@
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_3_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -270,7 +276,7 @@
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm2, %ymm1
@@ -280,7 +286,7 @@
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_3_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -288,14 +294,16 @@
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
;
; BITALG-LABEL: ult_3_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -307,7 +315,7 @@
define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_3_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -322,7 +330,7 @@
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -332,7 +340,7 @@
;
; AVX512BW-LABEL: ugt_3_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -340,13 +348,14 @@
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_3_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -361,7 +370,7 @@
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmaxub %ymm1, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -371,7 +380,7 @@
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_3_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -379,14 +388,16 @@
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
;
; BITALG-LABEL: ugt_3_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; BITALG-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -398,7 +409,7 @@
define <64 x i8> @ult_4_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_4_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -413,7 +424,7 @@
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpminub %ymm1, %ymm2, %ymm1
@@ -423,7 +434,7 @@
;
; AVX512BW-LABEL: ult_4_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -431,13 +442,14 @@
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_4_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -452,7 +464,7 @@
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm2, %ymm1
@@ -462,7 +474,7 @@
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_4_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -470,14 +482,16 @@
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
;
; BITALG-LABEL: ult_4_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -489,7 +503,7 @@
define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_4_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -504,7 +518,7 @@
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -514,7 +528,7 @@
;
; AVX512BW-LABEL: ugt_4_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -522,13 +536,14 @@
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_4_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -543,7 +558,7 @@
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmaxub %ymm1, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -553,7 +568,7 @@
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_4_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -561,14 +576,16 @@
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
;
; BITALG-LABEL: ugt_4_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; BITALG-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -580,7 +597,7 @@
define <64 x i8> @ult_5_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_5_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -595,7 +612,7 @@
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpminub %ymm1, %ymm2, %ymm1
@@ -605,7 +622,7 @@
;
; AVX512BW-LABEL: ult_5_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -613,13 +630,14 @@
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_5_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -634,7 +652,7 @@
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm2, %ymm1
@@ -644,7 +662,7 @@
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_5_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -652,14 +670,16 @@
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
;
; BITALG-LABEL: ult_5_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -671,7 +691,7 @@
define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_5_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -686,7 +706,7 @@
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -696,7 +716,7 @@
;
; AVX512BW-LABEL: ugt_5_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -704,13 +724,14 @@
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_5_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -725,7 +746,7 @@
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmaxub %ymm1, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -735,7 +756,7 @@
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_5_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -743,14 +764,16 @@
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
;
; BITALG-LABEL: ugt_5_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; BITALG-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -762,7 +785,7 @@
define <64 x i8> @ult_6_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_6_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -777,7 +800,7 @@
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpminub %ymm1, %ymm2, %ymm1
@@ -787,7 +810,7 @@
;
; AVX512BW-LABEL: ult_6_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -795,13 +818,14 @@
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_6_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -816,7 +840,7 @@
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm2, %ymm1
@@ -826,7 +850,7 @@
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_6_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -834,14 +858,16 @@
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
;
; BITALG-LABEL: ult_6_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -853,7 +879,7 @@
define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ugt_6_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -868,7 +894,7 @@
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -878,7 +904,7 @@
;
; AVX512BW-LABEL: ugt_6_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -886,13 +912,14 @@
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_6_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -907,7 +934,7 @@
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmaxub %ymm1, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
@@ -917,7 +944,7 @@
;
; AVX512VPOPCNTDQ-BW-LABEL: ugt_6_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -925,14 +952,16 @@
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
;
; BITALG-LABEL: ugt_6_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; BITALG-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -944,7 +973,7 @@
define <64 x i8> @ult_7_v64i8(<64 x i8> %0) {
; AVX512F-LABEL: ult_7_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -959,7 +988,7 @@
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpminub %ymm1, %ymm2, %ymm1
@@ -969,7 +998,7 @@
;
; AVX512BW-LABEL: ult_7_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -977,13 +1006,14 @@
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: ult_7_v64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -998,7 +1028,7 @@
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpminub %ymm1, %ymm2, %ymm1
@@ -1008,7 +1038,7 @@
;
; AVX512VPOPCNTDQ-BW-LABEL: ult_7_v64i8:
; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -1016,14 +1046,16 @@
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
;
; BITALG-LABEL: ult_7_v64i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2b %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0)
@@ -1082,7 +1114,8 @@
; BITALG-LABEL: ugt_1_v32i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -1139,7 +1172,8 @@
; BITALG-LABEL: ult_2_v32i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
; BITALG-NEXT: vpmovm2w %k0, %zmm0
; BITALG-NEXT: retq
%2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -1151,7 +1185,7 @@
define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) {
; AVX512F-LABEL: ugt_2_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -1172,7 +1206,7 @@
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:
vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1180,7 +1214,7 @@ ; ; AVX512BW-LABEL: ugt_2_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1191,7 +1225,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1204,7 +1239,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1212,7 +1247,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_2_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1223,14 +1258,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = 
[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ugt_2_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -1242,7 +1279,7 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_3_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1263,7 +1300,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1271,7 +1308,7 @@ ; ; AVX512BW-LABEL: ult_3_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1282,7 +1319,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1295,7 +1333,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, 
%ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1303,7 +1341,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_3_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1314,14 +1352,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ult_3_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -1333,7 +1373,7 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_3_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1354,7 +1394,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1362,7 +1402,7 @@ ; ; AVX512BW-LABEL: ugt_3_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1373,7 +1413,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1386,7 +1427,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1394,7 +1435,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_3_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1405,14 +1446,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ugt_3_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -1424,7 +1467,7 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> 
%0) { ; AVX512F-LABEL: ult_4_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1445,7 +1488,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1453,7 +1496,7 @@ ; ; AVX512BW-LABEL: ult_4_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1464,7 +1507,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1477,7 +1521,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1485,7 +1529,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_4_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1496,14 +1540,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ult_4_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -1515,7 +1561,7 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_4_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1536,7 +1582,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1544,7 +1590,7 @@ ; ; AVX512BW-LABEL: ugt_4_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1555,7 +1601,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 
= [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1568,7 +1615,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1576,7 +1623,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_4_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1587,14 +1634,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ugt_4_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -1606,7 +1655,7 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_5_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1627,7 +1676,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = 
[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1635,7 +1684,7 @@ ; ; AVX512BW-LABEL: ult_5_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1646,7 +1695,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1659,7 +1709,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1667,7 +1717,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_5_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1678,14 +1728,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; 
AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ult_5_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -1697,7 +1749,7 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_5_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1718,7 +1770,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1726,7 +1778,7 @@ ; ; AVX512BW-LABEL: ugt_5_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1737,7 +1789,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1750,7 +1803,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; 
AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1758,7 +1811,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_5_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1769,14 +1822,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ugt_5_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -1788,7 +1843,7 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_6_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1809,7 +1864,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1817,7 +1872,7 @@ ; ; AVX512BW-LABEL: ult_6_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1828,7 +1883,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1841,7 +1897,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1849,7 +1905,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_6_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1860,14 +1916,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ult_6_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -1879,7 +1937,7 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_6_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1900,7 +1958,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1908,7 +1966,7 @@ ; ; AVX512BW-LABEL: ugt_6_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1919,7 +1977,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1932,7 +1991,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1940,7 +1999,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_6_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -1951,14 +2010,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ugt_6_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -1970,7 +2031,7 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_7_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1991,7 +2052,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1999,7 +2060,7 @@ ; ; AVX512BW-LABEL: ult_7_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -2010,7 +2071,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -2023,7 +2085,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2031,7 +2093,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_7_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2042,14 +2104,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ult_7_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2061,7 +2125,7 @@
 define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_7_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2082,7 +2146,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2090,7 +2154,7 @@
 ;
 ; AVX512BW-LABEL: ugt_7_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2101,7 +2165,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -2114,7 +2179,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2122,7 +2187,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_7_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2133,14 +2198,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_7_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2152,7 +2219,7 @@
 define <32 x i16> @ult_8_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_8_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2173,7 +2240,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2181,7 +2248,7 @@
 ;
 ; AVX512BW-LABEL: ult_8_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2192,7 +2259,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -2205,7 +2273,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2213,7 +2281,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_8_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2224,14 +2292,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ult_8_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2243,7 +2313,7 @@
 define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_8_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2264,7 +2334,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2272,7 +2342,7 @@
 ;
 ; AVX512BW-LABEL: ugt_8_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2283,7 +2353,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -2296,7 +2367,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2304,7 +2375,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_8_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2315,14 +2386,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_8_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2334,7 +2407,7 @@
 define <32 x i16> @ult_9_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_9_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2355,7 +2428,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2363,7 +2436,7 @@
 ;
 ; AVX512BW-LABEL: ult_9_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2374,7 +2447,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -2387,7 +2461,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2395,7 +2469,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_9_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2406,14 +2480,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ult_9_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2425,7 +2501,7 @@
 define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_9_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2446,7 +2522,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2454,7 +2530,7 @@
 ;
 ; AVX512BW-LABEL: ugt_9_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2465,7 +2541,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -2478,7 +2555,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2486,7 +2563,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_9_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2497,14 +2574,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_9_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
+; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2516,7 +2595,7 @@
 define <32 x i16> @ult_10_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_10_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2537,7 +2616,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2545,7 +2624,7 @@
 ;
 ; AVX512BW-LABEL: ult_10_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2556,7 +2635,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -2569,7 +2649,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2577,7 +2657,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_10_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2588,14 +2668,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ult_10_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2607,7 +2689,7 @@
 define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_10_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2628,7 +2710,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2636,7 +2718,7 @@
 ;
 ; AVX512BW-LABEL: ugt_10_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2647,7 +2729,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -2660,7 +2743,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2668,7 +2751,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_10_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2679,14 +2762,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_10_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
+; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2698,7 +2783,7 @@
 define <32 x i16> @ult_11_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_11_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2719,7 +2804,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2727,7 +2812,7 @@
 ;
 ; AVX512BW-LABEL: ult_11_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2738,7 +2823,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -2751,7 +2837,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2759,7 +2845,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_11_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2770,14 +2856,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ult_11_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2789,7 +2877,7 @@
 define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_11_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2810,7 +2898,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2818,7 +2906,7 @@
 ;
 ; AVX512BW-LABEL: ugt_11_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2829,7 +2917,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -2842,7 +2931,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2850,7 +2939,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_11_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2861,14 +2950,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_11_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2880,7 +2971,7 @@
 define <32 x i16> @ult_12_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_12_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2901,7 +2992,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2909,7 +3000,7 @@
 ;
 ; AVX512BW-LABEL: ult_12_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2920,7 +3011,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -2933,7 +3025,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -2941,7 +3033,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_12_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -2952,14 +3044,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ult_12_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -2971,7 +3065,7 @@
 define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_12_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -2992,7 +3086,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3000,7 +3094,7 @@
 ;
 ; AVX512BW-LABEL: ugt_12_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -3011,7 +3105,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -3024,7 +3119,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3032,7 +3127,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_12_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -3043,14 +3138,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_12_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
+; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -3062,7 +3159,7 @@
 define <32 x i16> @ult_13_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_13_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3083,7 +3180,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3091,7 +3188,7 @@
 ;
 ; AVX512BW-LABEL: ult_13_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -3102,7 +3199,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -3115,7 +3213,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3123,7 +3221,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_13_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -3134,14 +3232,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ult_13_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -3153,7 +3253,7 @@
 define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_13_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3174,7 +3274,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3182,7 +3282,7 @@
 ;
 ; AVX512BW-LABEL: ugt_13_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -3193,7 +3293,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -3206,7 +3307,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3214,7 +3315,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ugt_13_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -3225,14 +3326,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ugt_13_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
+; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -3244,7 +3347,7 @@
 define <32 x i16> @ult_14_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ult_14_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3265,7 +3368,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3273,7 +3376,7 @@
 ;
 ; AVX512BW-LABEL: ult_14_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -3284,7 +3387,8 @@
 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -3297,7 +3401,7 @@
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3305,7 +3409,7 @@
 ;
 ; AVX512VPOPCNTDQ-BW-LABEL: ult_14_v32i16:
 ; AVX512VPOPCNTDQ-BW: # %bb.0:
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -3316,14 +3420,16 @@
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1
 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0
 ; AVX512VPOPCNTDQ-BW-NEXT: retq
 ;
 ; BITALG-LABEL: ult_14_v32i16:
 ; BITALG: # %bb.0:
 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0
-; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
 ; BITALG-NEXT: vpmovm2w %k0, %zmm0
 ; BITALG-NEXT: retq
 %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0)
@@ -3335,7 +3441,7 @@
 define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) {
 ; AVX512F-LABEL: ugt_14_v32i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2
@@ -3356,7 +3462,7 @@
 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3364,7 +3470,7 @@
 ;
 ; AVX512BW-LABEL: ugt_14_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 =
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -3375,7 +3481,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -3388,7 +3495,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3396,7 +3503,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_14_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -3407,14 +3514,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-BW-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ugt_14_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; BITALG-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: 
retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -3426,7 +3535,7 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_15_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3447,7 +3556,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3455,7 +3564,7 @@ ; ; AVX512BW-LABEL: ult_15_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -3466,7 +3575,8 @@ ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -3479,7 +3589,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3487,7 +3597,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_15_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -3498,14 +3608,16 @@ ; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 ; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; AVX512VPOPCNTDQ-BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: retq ; ; BITALG-LABEL: ult_15_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpltuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; BITALG-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -3592,7 +3704,7 @@ ; AVX512F-LABEL: ugt_2_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -3624,7 +3736,7 @@ ; ; AVX512BW-LABEL: ugt_2_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -3671,7 +3783,7 @@ ; AVX512F-LABEL: ult_3_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -3703,7 +3815,7 @@ ; ; AVX512BW-LABEL: ult_3_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -3750,7 +3862,7 @@ ; AVX512F-LABEL: ugt_3_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -3782,7 +3894,7 @@ ; ; AVX512BW-LABEL: ugt_3_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -3829,7 +3941,7 @@ ; AVX512F-LABEL: ult_4_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -3861,7 +3973,7 @@ ; ; AVX512BW-LABEL: ult_4_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -3908,7 +4020,7 @@ ; AVX512F-LABEL: ugt_4_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -3940,7 +4052,7 @@ ; ; AVX512BW-LABEL: ugt_4_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -3987,7 +4099,7 @@ ; AVX512F-LABEL: ult_5_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4019,7 +4131,7 @@ ; ; AVX512BW-LABEL: ult_5_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4066,7 +4178,7 @@ ; AVX512F-LABEL: ugt_5_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4098,7 +4210,7 @@ ; ; AVX512BW-LABEL: ugt_5_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4145,7 +4257,7 @@ ; AVX512F-LABEL: ult_6_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4177,7 +4289,7 @@ ; ; AVX512BW-LABEL: ult_6_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4224,7 +4336,7 @@ ; AVX512F-LABEL: ugt_6_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4256,7 +4368,7 @@ ; ; AVX512BW-LABEL: ugt_6_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4303,7 +4415,7 @@ ; AVX512F-LABEL: ult_7_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4335,7 +4447,7 @@ ; ; AVX512BW-LABEL: ult_7_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4382,7 +4494,7 @@ ; AVX512F-LABEL: ugt_7_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4414,7 +4526,7 @@ ; ; AVX512BW-LABEL: ugt_7_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4461,7 +4573,7 @@ ; AVX512F-LABEL: ult_8_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4493,7 +4605,7 @@ ; ; AVX512BW-LABEL: ult_8_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4540,7 +4652,7 @@ ; AVX512F-LABEL: ugt_8_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4572,7 +4684,7 @@ ; ; AVX512BW-LABEL: ugt_8_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4619,7 +4731,7 @@ ; AVX512F-LABEL: ult_9_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4651,7 +4763,7 @@ ; ; AVX512BW-LABEL: ult_9_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4698,7 +4810,7 @@ ; AVX512F-LABEL: ugt_9_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4730,7 +4842,7 @@ ; ; AVX512BW-LABEL: ugt_9_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4777,7 +4889,7 @@ ; AVX512F-LABEL: ult_10_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4809,7 +4921,7 @@ ; ; AVX512BW-LABEL: ult_10_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4856,7 +4968,7 @@ ; AVX512F-LABEL: ugt_10_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4888,7 +5000,7 @@ ; ; AVX512BW-LABEL: ugt_10_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -4935,7 +5047,7 @@ ; AVX512F-LABEL: ult_11_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4967,7 +5079,7 @@ ; ; AVX512BW-LABEL: ult_11_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5014,7 +5126,7 @@ ; AVX512F-LABEL: ugt_11_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5046,7 +5158,7 @@ ; ; AVX512BW-LABEL: ugt_11_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5093,7 +5205,7 @@ ; AVX512F-LABEL: ult_12_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5125,7 +5237,7 @@ ; ; AVX512BW-LABEL: ult_12_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5172,7 +5284,7 @@ ; AVX512F-LABEL: ugt_12_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5204,7 +5316,7 @@ ; ; AVX512BW-LABEL: ugt_12_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5251,7 +5363,7 @@ ; AVX512F-LABEL: ult_13_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5283,7 +5395,7 @@ ; ; AVX512BW-LABEL: ult_13_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5330,7 +5442,7 @@ ; AVX512F-LABEL: ugt_13_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5362,7 +5474,7 @@ ; ; AVX512BW-LABEL: ugt_13_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5409,7 +5521,7 @@ ; AVX512F-LABEL: ult_14_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5441,7 +5553,7 @@ ; ; AVX512BW-LABEL: ult_14_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5488,7 +5600,7 @@ ; AVX512F-LABEL: ugt_14_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5520,7 +5632,7 @@ ; ; AVX512BW-LABEL: ugt_14_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5567,7 +5679,7 @@ ; AVX512F-LABEL: ult_15_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5599,7 +5711,7 @@ ; ; AVX512BW-LABEL: ult_15_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5646,7 +5758,7 @@ ; AVX512F-LABEL: ugt_15_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5678,7 +5790,7 @@ ; ; AVX512BW-LABEL: ugt_15_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5725,7 +5837,7 @@ ; AVX512F-LABEL: ult_16_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5757,7 +5869,7 @@ ; ; AVX512BW-LABEL: ult_16_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5804,7 +5916,7 @@ ; AVX512F-LABEL: ugt_16_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5836,7 +5948,7 @@ ; ; AVX512BW-LABEL: ugt_16_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5883,7 +5995,7 @@ ; AVX512F-LABEL: ult_17_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5915,7 +6027,7 @@ ; ; AVX512BW-LABEL: ult_17_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -5962,7 +6074,7 @@ ; AVX512F-LABEL: ugt_17_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5994,7 +6106,7 @@ ; ; AVX512BW-LABEL: ugt_17_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6041,7 +6153,7 @@ ; AVX512F-LABEL: ult_18_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6073,7 +6185,7 @@ ; ; AVX512BW-LABEL: ult_18_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6120,7 +6232,7 @@ ; AVX512F-LABEL: ugt_18_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6152,7 +6264,7 @@ ; ; AVX512BW-LABEL: ugt_18_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6199,7 +6311,7 @@ ; AVX512F-LABEL: ult_19_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6231,7 +6343,7 @@ ; ; AVX512BW-LABEL: ult_19_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6278,7 +6390,7 @@ ; AVX512F-LABEL: ugt_19_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6310,7 +6422,7 @@ ; ; AVX512BW-LABEL: ugt_19_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6357,7 +6469,7 @@ ; AVX512F-LABEL: ult_20_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6389,7 +6501,7 @@ ; ; AVX512BW-LABEL: ult_20_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6436,7 +6548,7 @@ ; AVX512F-LABEL: ugt_20_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6468,7 +6580,7 @@ ; ; AVX512BW-LABEL: ugt_20_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6515,7 +6627,7 @@ ; AVX512F-LABEL: ult_21_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6547,7 +6659,7 @@ ; ; AVX512BW-LABEL: ult_21_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6594,7 +6706,7 @@ ; AVX512F-LABEL: ugt_21_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6626,7 +6738,7 @@ ; ; AVX512BW-LABEL: ugt_21_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6673,7 +6785,7 @@ ; AVX512F-LABEL: ult_22_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6705,7 +6817,7 @@ ; ; AVX512BW-LABEL: ult_22_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6752,7 +6864,7 @@ ; AVX512F-LABEL: ugt_22_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6784,7 +6896,7 @@ ; ; AVX512BW-LABEL: ugt_22_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6831,7 +6943,7 @@ ; AVX512F-LABEL: ult_23_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6863,7 +6975,7 @@ ; ; AVX512BW-LABEL: ult_23_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6910,7 +7022,7 @@ ; AVX512F-LABEL: ugt_23_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6942,7 +7054,7 @@ ; ; AVX512BW-LABEL: ugt_23_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -6989,7 +7101,7 @@ ; AVX512F-LABEL: ult_24_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7021,7 +7133,7 @@ ; ; AVX512BW-LABEL: ult_24_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7068,7 +7180,7 @@ ; AVX512F-LABEL: ugt_24_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7100,7 +7212,7 @@ ; ; AVX512BW-LABEL: ugt_24_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7147,7 +7259,7 @@ ; AVX512F-LABEL: ult_25_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7179,7 +7291,7 @@ ; ; AVX512BW-LABEL: ult_25_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7226,7 +7338,7 @@ ; AVX512F-LABEL: ugt_25_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7258,7 +7370,7 @@ ; ; AVX512BW-LABEL: ugt_25_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7305,7 +7417,7 @@ ; AVX512F-LABEL: ult_26_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7337,7 +7449,7 @@ ; ; AVX512BW-LABEL: ult_26_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7384,7 +7496,7 @@ ; AVX512F-LABEL: ugt_26_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7416,7 +7528,7 @@ ; ; AVX512BW-LABEL: ugt_26_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7463,7 +7575,7 @@ ; AVX512F-LABEL: ult_27_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7495,7 +7607,7 @@ ; ; AVX512BW-LABEL: ult_27_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7542,7 +7654,7 @@ ; AVX512F-LABEL: ugt_27_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7574,7 +7686,7 @@ ; ; AVX512BW-LABEL: ugt_27_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7621,7 +7733,7 @@ ; AVX512F-LABEL: ult_28_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7653,7 +7765,7 @@ ; ; AVX512BW-LABEL: ult_28_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7700,7 +7812,7 @@ ; AVX512F-LABEL: ugt_28_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7732,7 +7844,7 @@ ; ; AVX512BW-LABEL: ugt_28_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7779,7 +7891,7 @@ ; AVX512F-LABEL: ult_29_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7811,7 +7923,7 @@ ; ; AVX512BW-LABEL: ult_29_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7858,7 +7970,7 @@ ; AVX512F-LABEL: ugt_29_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7890,7 +8002,7 @@ ; ; AVX512BW-LABEL: ugt_29_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -7937,7 +8049,7 @@ ; AVX512F-LABEL: ult_30_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7969,7 +8081,7 @@ ; ; AVX512BW-LABEL: ult_30_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8016,7 +8128,7 @@ ; AVX512F-LABEL: ugt_30_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8048,7 +8160,7 @@ ; ; AVX512BW-LABEL: ugt_30_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8095,7 +8207,7 @@ ; AVX512F-LABEL: ult_31_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8127,7 +8239,7 @@ ; ; AVX512BW-LABEL: ult_31_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8248,7 +8360,7 @@ ; AVX512F-LABEL: ugt_2_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8272,7 +8384,7 @@ ; ; AVX512BW-LABEL: ugt_2_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8311,7 +8423,7 @@ ; AVX512F-LABEL: ult_3_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8335,7 +8447,7 @@ ; ; AVX512BW-LABEL: ult_3_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8374,7 +8486,7 @@ ; AVX512F-LABEL: ugt_3_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8398,7 +8510,7 @@ ; ; AVX512BW-LABEL: ugt_3_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8437,7 +8549,7 @@ ; AVX512F-LABEL: ult_4_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: 
vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8461,7 +8573,7 @@ ; ; AVX512BW-LABEL: ult_4_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8500,7 +8612,7 @@ ; AVX512F-LABEL: ugt_4_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8524,7 +8636,7 @@ ; ; AVX512BW-LABEL: ugt_4_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8563,7 +8675,7 @@ ; AVX512F-LABEL: ult_5_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8587,7 +8699,7 @@ ; ; AVX512BW-LABEL: ult_5_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8626,7 +8738,7 @@ ; AVX512F-LABEL: ugt_5_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8650,7 +8762,7 @@ ; ; AVX512BW-LABEL: ugt_5_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8689,7 +8801,7 @@ ; AVX512F-LABEL: ult_6_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8713,7 +8825,7 @@ ; ; AVX512BW-LABEL: ult_6_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8752,7 +8864,7 @@ ; AVX512F-LABEL: ugt_6_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: 
vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8776,7 +8888,7 @@ ; ; AVX512BW-LABEL: ugt_6_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8815,7 +8927,7 @@ ; AVX512F-LABEL: ult_7_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8839,7 +8951,7 @@ ; ; AVX512BW-LABEL: ult_7_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8878,7 +8990,7 @@ ; AVX512F-LABEL: ugt_7_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8902,7 +9014,7 @@ ; ; AVX512BW-LABEL: ugt_7_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -8941,7 +9053,7 @@ ; AVX512F-LABEL: ult_8_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8965,7 +9077,7 @@ ; ; AVX512BW-LABEL: ult_8_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9004,7 +9116,7 @@ ; AVX512F-LABEL: ugt_8_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9028,7 +9140,7 @@ ; ; AVX512BW-LABEL: ugt_8_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9067,7 +9179,7 @@ ; AVX512F-LABEL: ult_9_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: 
vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9091,7 +9203,7 @@ ; ; AVX512BW-LABEL: ult_9_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9130,7 +9242,7 @@ ; AVX512F-LABEL: ugt_9_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9154,7 +9266,7 @@ ; ; AVX512BW-LABEL: ugt_9_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9193,7 +9305,7 @@ ; AVX512F-LABEL: ult_10_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9217,7 +9329,7 @@ ; ; AVX512BW-LABEL: ult_10_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9256,7 +9368,7 @@ ; AVX512F-LABEL: ugt_10_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9280,7 +9392,7 @@ ; ; AVX512BW-LABEL: ugt_10_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9319,7 +9431,7 @@ ; AVX512F-LABEL: ult_11_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9343,7 +9455,7 @@ ; ; AVX512BW-LABEL: ult_11_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9382,7 +9494,7 @@ ; AVX512F-LABEL: ugt_11_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9406,7 +9518,7 @@ ; ; AVX512BW-LABEL: ugt_11_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9445,7 +9557,7 @@ ; AVX512F-LABEL: ult_12_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9469,7 +9581,7 @@ ; ; AVX512BW-LABEL: ult_12_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9508,7 +9620,7 @@ ; AVX512F-LABEL: ugt_12_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9532,7 +9644,7 @@ ; ; AVX512BW-LABEL: ugt_12_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9571,7 +9683,7 @@ ; AVX512F-LABEL: ult_13_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9595,7 +9707,7 @@ ; ; AVX512BW-LABEL: ult_13_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9634,7 +9746,7 @@ ; AVX512F-LABEL: ugt_13_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9658,7 +9770,7 @@ ; ; AVX512BW-LABEL: ugt_13_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9697,7 +9809,7 @@ ; AVX512F-LABEL: ult_14_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9721,7 +9833,7 @@ ; ; AVX512BW-LABEL: ult_14_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9760,7 +9872,7 @@ ; AVX512F-LABEL: ugt_14_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9784,7 +9896,7 @@ ; ; AVX512BW-LABEL: ugt_14_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9823,7 +9935,7 @@ ; AVX512F-LABEL: ult_15_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9847,7 +9959,7 @@ ; ; AVX512BW-LABEL: ult_15_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9886,7 +9998,7 @@ ; AVX512F-LABEL: ugt_15_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9910,7 +10022,7 @@ ; ; AVX512BW-LABEL: ugt_15_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -9949,7 +10061,7 @@ ; AVX512F-LABEL: ult_16_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9973,7 +10085,7 @@ ; ; AVX512BW-LABEL: ult_16_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10012,7 +10124,7 @@ ; AVX512F-LABEL: ugt_16_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10036,7 +10148,7 @@ ; ; AVX512BW-LABEL: ugt_16_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10075,7 +10187,7 @@ ; AVX512F-LABEL: ult_17_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10099,7 +10211,7 @@ ; ; AVX512BW-LABEL: ult_17_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10138,7 +10250,7 @@ ; AVX512F-LABEL: ugt_17_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10162,7 +10274,7 @@ ; ; AVX512BW-LABEL: ugt_17_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10201,7 +10313,7 @@ ; AVX512F-LABEL: ult_18_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10225,7 +10337,7 @@ ; ; AVX512BW-LABEL: ult_18_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10264,7 +10376,7 @@ ; AVX512F-LABEL: ugt_18_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10288,7 +10400,7 @@ ; ; AVX512BW-LABEL: ugt_18_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10327,7 +10439,7 @@ ; AVX512F-LABEL: ult_19_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10351,7 +10463,7 @@ ; ; AVX512BW-LABEL: ult_19_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10390,7 +10502,7 @@ ; AVX512F-LABEL: ugt_19_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10414,7 +10526,7 @@ ; ; AVX512BW-LABEL: ugt_19_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10453,7 +10565,7 @@ ; AVX512F-LABEL: ult_20_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10477,7 +10589,7 @@ ; ; AVX512BW-LABEL: ult_20_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10516,7 +10628,7 @@ ; AVX512F-LABEL: ugt_20_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10540,7 +10652,7 @@ ; ; AVX512BW-LABEL: ugt_20_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10579,7 +10691,7 @@ ; AVX512F-LABEL: ult_21_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10603,7 +10715,7 @@ ; ; AVX512BW-LABEL: ult_21_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10642,7 +10754,7 @@ ; AVX512F-LABEL: ugt_21_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10666,7 +10778,7 @@ ; ; AVX512BW-LABEL: ugt_21_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10705,7 +10817,7 @@ ; AVX512F-LABEL: ult_22_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10729,7 +10841,7 @@ ; ; AVX512BW-LABEL: ult_22_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10768,7 +10880,7 @@ ; AVX512F-LABEL: ugt_22_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10792,7 +10904,7 @@ ; ; AVX512BW-LABEL: ugt_22_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10831,7 +10943,7 @@ ; AVX512F-LABEL: ult_23_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10855,7 +10967,7 @@ ; ; AVX512BW-LABEL: ult_23_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10894,7 +11006,7 @@ ; AVX512F-LABEL: ugt_23_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10918,7 +11030,7 @@ ; ; AVX512BW-LABEL: ugt_23_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -10957,7 +11069,7 @@ ; AVX512F-LABEL: ult_24_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10981,7 +11093,7 @@ ; ; AVX512BW-LABEL: ult_24_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11020,7 +11132,7 @@ ; AVX512F-LABEL: ugt_24_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11044,7 +11156,7 @@ ; ; AVX512BW-LABEL: ugt_24_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11083,7 +11195,7 @@ ; AVX512F-LABEL: ult_25_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11107,7 +11219,7 @@ ; ; AVX512BW-LABEL: ult_25_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11146,7 +11258,7 @@ ; AVX512F-LABEL: ugt_25_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11170,7 +11282,7 @@ ; ; AVX512BW-LABEL: ugt_25_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11209,7 +11321,7 @@ ; AVX512F-LABEL: ult_26_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11233,7 +11345,7 @@ ; ; AVX512BW-LABEL: ult_26_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11272,7 +11384,7 @@ ; AVX512F-LABEL: ugt_26_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11296,7 +11408,7 @@ ; ; AVX512BW-LABEL: ugt_26_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11335,7 +11447,7 @@ ; AVX512F-LABEL: ult_27_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11359,7 +11471,7 @@ ; ; AVX512BW-LABEL: ult_27_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11398,7 +11510,7 @@ ; AVX512F-LABEL: ugt_27_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11422,7 +11534,7 @@ ; ; AVX512BW-LABEL: ugt_27_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11461,7 +11573,7 @@ ; AVX512F-LABEL: ult_28_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11485,7 +11597,7 @@ ; ; AVX512BW-LABEL: ult_28_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11524,7 +11636,7 @@ ; AVX512F-LABEL: ugt_28_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11548,7 +11660,7 @@ ; ; AVX512BW-LABEL: ugt_28_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11587,7 +11699,7 @@ ; AVX512F-LABEL: ult_29_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11611,7 +11723,7 @@ ; ; AVX512BW-LABEL: ult_29_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11650,7 +11762,7 @@ ; AVX512F-LABEL: ugt_29_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11674,7 +11786,7 @@ ; ; AVX512BW-LABEL: ugt_29_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -11713,7 +11825,7 @@ ; AVX512F-LABEL: ult_30_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11737,7 +11849,7 @@ ; ; AVX512BW-LABEL: ult_30_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -11776,7 +11888,7 @@
 ; AVX512F-LABEL: ugt_30_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -11800,7 +11912,7 @@
 ;
 ; AVX512BW-LABEL: ugt_30_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -11839,7 +11951,7 @@
 ; AVX512F-LABEL: ult_31_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -11863,7 +11975,7 @@
 ;
 ; AVX512BW-LABEL: ult_31_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -11902,7 +12014,7 @@
 ; AVX512F-LABEL: ugt_31_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -11926,7 +12038,7 @@
 ;
 ; AVX512BW-LABEL: ugt_31_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -11965,7 +12077,7 @@
 ; AVX512F-LABEL: ult_32_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -11989,7 +12101,7 @@
 ;
 ; AVX512BW-LABEL: ult_32_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12028,7 +12140,7 @@
 ; AVX512F-LABEL: ugt_32_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12052,7 +12164,7 @@
 ;
 ; AVX512BW-LABEL: ugt_32_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12091,7 +12203,7 @@
 ; AVX512F-LABEL: ult_33_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12115,7 +12227,7 @@
 ;
 ; AVX512BW-LABEL: ult_33_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12154,7 +12266,7 @@
 ; AVX512F-LABEL: ugt_33_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12178,7 +12290,7 @@
 ;
 ; AVX512BW-LABEL: ugt_33_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12217,7 +12329,7 @@
 ; AVX512F-LABEL: ult_34_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12241,7 +12353,7 @@
 ;
 ; AVX512BW-LABEL: ult_34_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12280,7 +12392,7 @@
 ; AVX512F-LABEL: ugt_34_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12304,7 +12416,7 @@
 ;
 ; AVX512BW-LABEL: ugt_34_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12343,7 +12455,7 @@
 ; AVX512F-LABEL: ult_35_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12367,7 +12479,7 @@
 ;
 ; AVX512BW-LABEL: ult_35_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12406,7 +12518,7 @@
 ; AVX512F-LABEL: ugt_35_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12430,7 +12542,7 @@
 ;
 ; AVX512BW-LABEL: ugt_35_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12469,7 +12581,7 @@
 ; AVX512F-LABEL: ult_36_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12493,7 +12605,7 @@
 ;
 ; AVX512BW-LABEL: ult_36_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12532,7 +12644,7 @@
 ; AVX512F-LABEL: ugt_36_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12556,7 +12668,7 @@
 ;
 ; AVX512BW-LABEL: ugt_36_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12595,7 +12707,7 @@
 ; AVX512F-LABEL: ult_37_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12619,7 +12731,7 @@
 ;
 ; AVX512BW-LABEL: ult_37_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12658,7 +12770,7 @@
 ; AVX512F-LABEL: ugt_37_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12682,7 +12794,7 @@
 ;
 ; AVX512BW-LABEL: ugt_37_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12721,7 +12833,7 @@
 ; AVX512F-LABEL: ult_38_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12745,7 +12857,7 @@
 ;
 ; AVX512BW-LABEL: ult_38_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12784,7 +12896,7 @@
 ; AVX512F-LABEL: ugt_38_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12808,7 +12920,7 @@
 ;
 ; AVX512BW-LABEL: ugt_38_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12847,7 +12959,7 @@
 ; AVX512F-LABEL: ult_39_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12871,7 +12983,7 @@
 ;
 ; AVX512BW-LABEL: ult_39_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12910,7 +13022,7 @@
 ; AVX512F-LABEL: ugt_39_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12934,7 +13046,7 @@
 ;
 ; AVX512BW-LABEL: ugt_39_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -12973,7 +13085,7 @@
 ; AVX512F-LABEL: ult_40_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -12997,7 +13109,7 @@
 ;
 ; AVX512BW-LABEL: ult_40_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13036,7 +13148,7 @@
 ; AVX512F-LABEL: ugt_40_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13060,7 +13172,7 @@
 ;
 ; AVX512BW-LABEL: ugt_40_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13099,7 +13211,7 @@
 ; AVX512F-LABEL: ult_41_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13123,7 +13235,7 @@
 ;
 ; AVX512BW-LABEL: ult_41_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13162,7 +13274,7 @@
 ; AVX512F-LABEL: ugt_41_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13186,7 +13298,7 @@
 ;
 ; AVX512BW-LABEL: ugt_41_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13225,7 +13337,7 @@
 ; AVX512F-LABEL: ult_42_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13249,7 +13361,7 @@
 ;
 ; AVX512BW-LABEL: ult_42_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13288,7 +13400,7 @@
 ; AVX512F-LABEL: ugt_42_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13312,7 +13424,7 @@
 ;
 ; AVX512BW-LABEL: ugt_42_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13351,7 +13463,7 @@
 ; AVX512F-LABEL: ult_43_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13375,7 +13487,7 @@
 ;
 ; AVX512BW-LABEL: ult_43_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13414,7 +13526,7 @@
 ; AVX512F-LABEL: ugt_43_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13438,7 +13550,7 @@
 ;
 ; AVX512BW-LABEL: ugt_43_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13477,7 +13589,7 @@
 ; AVX512F-LABEL: ult_44_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13501,7 +13613,7 @@
 ;
 ; AVX512BW-LABEL: ult_44_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13540,7 +13652,7 @@
 ; AVX512F-LABEL: ugt_44_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13564,7 +13676,7 @@
 ;
 ; AVX512BW-LABEL: ugt_44_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13603,7 +13715,7 @@
 ; AVX512F-LABEL: ult_45_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13627,7 +13739,7 @@
 ;
 ; AVX512BW-LABEL: ult_45_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13666,7 +13778,7 @@
 ; AVX512F-LABEL: ugt_45_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13690,7 +13802,7 @@
 ;
 ; AVX512BW-LABEL: ugt_45_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13729,7 +13841,7 @@
 ; AVX512F-LABEL: ult_46_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13753,7 +13865,7 @@
 ;
 ; AVX512BW-LABEL: ult_46_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13792,7 +13904,7 @@
 ; AVX512F-LABEL: ugt_46_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13816,7 +13928,7 @@
 ;
 ; AVX512BW-LABEL: ugt_46_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13855,7 +13967,7 @@
 ; AVX512F-LABEL: ult_47_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13879,7 +13991,7 @@
 ;
 ; AVX512BW-LABEL: ult_47_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13918,7 +14030,7 @@
 ; AVX512F-LABEL: ugt_47_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -13942,7 +14054,7 @@
 ;
 ; AVX512BW-LABEL: ugt_47_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -13981,7 +14093,7 @@
 ; AVX512F-LABEL: ult_48_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14005,7 +14117,7 @@
 ;
 ; AVX512BW-LABEL: ult_48_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14044,7 +14156,7 @@
 ; AVX512F-LABEL: ugt_48_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14068,7 +14180,7 @@
 ;
 ; AVX512BW-LABEL: ugt_48_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14107,7 +14219,7 @@
 ; AVX512F-LABEL: ult_49_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14131,7 +14243,7 @@
 ;
 ; AVX512BW-LABEL: ult_49_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14170,7 +14282,7 @@
 ; AVX512F-LABEL: ugt_49_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14194,7 +14306,7 @@
 ;
 ; AVX512BW-LABEL: ugt_49_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14233,7 +14345,7 @@
 ; AVX512F-LABEL: ult_50_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14257,7 +14369,7 @@
 ;
 ; AVX512BW-LABEL: ult_50_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14296,7 +14408,7 @@
 ; AVX512F-LABEL: ugt_50_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14320,7 +14432,7 @@
 ;
 ; AVX512BW-LABEL: ugt_50_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14359,7 +14471,7 @@
 ; AVX512F-LABEL: ult_51_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14383,7 +14495,7 @@
 ;
 ; AVX512BW-LABEL: ult_51_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14422,7 +14534,7 @@
 ; AVX512F-LABEL: ugt_51_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14446,7 +14558,7 @@
 ;
 ; AVX512BW-LABEL: ugt_51_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14485,7 +14597,7 @@
 ; AVX512F-LABEL: ult_52_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14509,7 +14621,7 @@
 ;
 ; AVX512BW-LABEL: ult_52_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14548,7 +14660,7 @@
 ; AVX512F-LABEL: ugt_52_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14572,7 +14684,7 @@
 ;
 ; AVX512BW-LABEL: ugt_52_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14611,7 +14723,7 @@
 ; AVX512F-LABEL: ult_53_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14635,7 +14747,7 @@
 ;
 ; AVX512BW-LABEL: ult_53_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14674,7 +14786,7 @@
 ; AVX512F-LABEL: ugt_53_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
@@ -14698,7 +14810,7 @@
 ;
 ; AVX512BW-LABEL: ugt_53_v8i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -14737,7 +14849,7 @@
 ; AVX512F-LABEL: ult_54_v8i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+;
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14761,7 +14873,7 @@ ; ; AVX512BW-LABEL: ult_54_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -14800,7 +14912,7 @@ ; AVX512F-LABEL: ugt_54_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14824,7 +14936,7 @@ ; ; AVX512BW-LABEL: ugt_54_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -14863,7 +14975,7 @@ ; AVX512F-LABEL: ult_55_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14887,7 +14999,7 @@ ; ; AVX512BW-LABEL: ult_55_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -14926,7 +15038,7 @@ ; AVX512F-LABEL: ugt_55_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14950,7 +15062,7 @@ ; ; AVX512BW-LABEL: ugt_55_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -14989,7 +15101,7 @@ ; AVX512F-LABEL: ult_56_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15013,7 +15125,7 @@ ; ; AVX512BW-LABEL: ult_56_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15052,7 +15164,7 @@ ; AVX512F-LABEL: ugt_56_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15076,7 +15188,7 @@ ; ; AVX512BW-LABEL: ugt_56_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15115,7 +15227,7 @@ ; AVX512F-LABEL: ult_57_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15139,7 +15251,7 @@ ; ; AVX512BW-LABEL: ult_57_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15178,7 +15290,7 @@ ; AVX512F-LABEL: ugt_57_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15202,7 +15314,7 @@ ; ; AVX512BW-LABEL: ugt_57_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15241,7 +15353,7 @@ ; AVX512F-LABEL: ult_58_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15265,7 +15377,7 @@ ; ; AVX512BW-LABEL: ult_58_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15304,7 +15416,7 @@ ; AVX512F-LABEL: ugt_58_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15328,7 +15440,7 @@ ; ; AVX512BW-LABEL: ugt_58_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15367,7 +15479,7 @@ ; AVX512F-LABEL: ult_59_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15391,7 +15503,7 @@ ; ; AVX512BW-LABEL: ult_59_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15430,7 +15542,7 @@ ; AVX512F-LABEL: ugt_59_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15454,7 +15566,7 @@ ; ; AVX512BW-LABEL: ugt_59_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15493,7 +15605,7 @@ ; AVX512F-LABEL: ult_60_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15517,7 +15629,7 @@ ; ; AVX512BW-LABEL: ult_60_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15556,7 +15668,7 @@ ; AVX512F-LABEL: ugt_60_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15580,7 +15692,7 @@ ; ; AVX512BW-LABEL: ugt_60_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15619,7 +15731,7 @@ ; AVX512F-LABEL: ult_61_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15643,7 +15755,7 @@ ; ; AVX512BW-LABEL: ult_61_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15682,7 +15794,7 @@ ; AVX512F-LABEL: ugt_61_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15706,7 +15818,7 @@ ; ; AVX512BW-LABEL: ugt_61_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15745,7 +15857,7 @@ ; AVX512F-LABEL: ult_62_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15769,7 +15881,7 @@ ; ; AVX512BW-LABEL: ult_62_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15808,7 +15920,7 @@ ; AVX512F-LABEL: ugt_62_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15832,7 +15944,7 @@ ; ; AVX512BW-LABEL: ugt_62_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -15871,7 +15983,7 @@ ; AVX512F-LABEL: ult_63_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15895,7 +16007,7 @@ ; ; AVX512BW-LABEL: ult_63_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll @@ -9,7 +9,7 @@ ; AVX512F-LABEL: testv8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -31,7 +31,7 @@ ; ; AVX512BW-LABEL: testv8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -62,7 +62,7 @@ ; AVX512F-LABEL: testv16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: 
vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -92,7 +92,7 @@ ; ; AVX512BW-LABEL: testv16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -130,7 +130,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512F-LABEL: testv32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -156,7 +156,7 @@ ; ; AVX512BW-LABEL: testv32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -183,7 +183,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: testv32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -208,7 +208,7 @@ ; AVX512F-LABEL: testv64i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -227,7 +227,7 @@ ; ; AVX512BW-LABEL: testv64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -240,7 +240,7 @@ ; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -259,7 +259,7 @@ ; ; AVX512VPOPCNTDQ-BW-LABEL: testv64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -501,7 +501,8 @@ ; BITALG-LABEL: eq_1_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; BITALG-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = 
tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -572,7 +573,8 @@ ; BITALG-LABEL: ne_1_v32i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; BITALG-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2w %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %0) @@ -639,7 +641,8 @@ ; BITALG-LABEL: eq_1_v64i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; BITALG-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0) @@ -710,7 +713,8 @@ ; BITALG-LABEL: ne_1_v64i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG-NEXT: vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; BITALG-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; BITALG-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 ; BITALG-NEXT: vpmovm2b %k0, %zmm0 ; BITALG-NEXT: retq %2 = tail call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %0) diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -276,7 +276,8 @@ ; AVX512-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1216,7 +1217,7 @@ ; ; AVX2-LABEL: test_v64i16_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 @@ -1243,7 +1244,8 @@ ; AVX512-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -836,11 +836,25 @@ ; SSE41-NEXT: sete %al 
; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1-LABEL: trunc_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1-NEXT: sete %al +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] +; AVX2-NEXT: vptest %xmm1, %xmm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] +; AVX512-NEXT: vptest %xmm1, %xmm0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: retq %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0) %2 = trunc i64 %1 to i16 %3 = icmp eq i16 %2, 0 @@ -1028,12 +1042,28 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: PR44781: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1-LABEL: PR44781: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1-NEXT: sete %al +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR44781: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64424509455,64424509455] +; AVX2-NEXT: vptest %xmm1, %xmm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR44781: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64424509455,64424509455] +; AVX512-NEXT: vptest %xmm1, %xmm0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: retq %2 = bitcast %struct.Box* %0 to <4 x i32>* %3 = load <4 x i32>, <4 x i32>* %2, align 4 %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %3) diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -1199,18 +1199,29 @@ ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax ; SSE4-NEXT: retq ; -; AVX-LABEL: test_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vphminposuw %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +; AVX1-LABEL: test_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF @@ -1261,7 +1272,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: 
vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF @@ -1273,7 +1285,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF @@ -1333,7 +1346,8 @@ ; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF @@ -1347,7 +1361,8 @@ ; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF @@ -1423,7 +1438,8 @@ ; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF @@ -1438,7 +1454,8 @@ ; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF @@ -1664,20 +1681,33 @@ ; SSE4-NEXT: # kill: def $al killed $al killed $eax ; SSE4-NEXT: retq ; -; AVX-LABEL: test_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphminposuw %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: xorb $127, %al -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: xorb $127, %al +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: 
vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: xorb $127, %al +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1758,7 +1788,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -1772,7 +1803,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1870,7 +1902,8 @@ ; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -1886,7 +1919,8 @@ ; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -2016,7 +2050,8 @@ ; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vphminposuw %xmm0, %xmm0 @@ -2033,7 +2068,8 @@ ; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrlw $8, 
%xmm0, %xmm1
; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphminposuw %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
@@ -1199,18 +1199,29 @@
; SSE4-NEXT: # kill: def $ax killed $ax killed $eax
; SSE4-NEXT: retq
;
-; AVX-LABEL: test_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vphminposuw %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: xorl $32768, %eax # imm = 0x8000
+; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphminposuw %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000
@@ -1261,7 +1272,8 @@
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vphminposuw %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000
@@ -1273,7 +1285,8 @@
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphminposuw %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000
@@ -1333,7 +1346,8 @@
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vphminposuw %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000
@@ -1347,7 +1361,8 @@
; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphminposuw %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000
@@ -1423,7 +1438,8 @@
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vphminposuw %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000
@@ -1438,7 +1454,8 @@
; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphminposuw %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000
@@ -1664,20 +1681,33 @@
; SSE4-NEXT: # kill: def $al killed $al killed $eax
; SSE4-NEXT: retq
;
-; AVX-LABEL: test_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphminposuw %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: addb $-128, %al
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: addb $-128, %al
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: addb $-128, %al
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphminposuw %xmm0, %xmm0
@@ -1758,7 +1788,8 @@
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vphminposuw %xmm0, %xmm0
@@ -1772,7 +1803,8 @@
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphminposuw %xmm0, %xmm0
@@ -1870,7 +1902,8 @@
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vphminposuw %xmm0, %xmm0
@@ -1886,7 +1919,8 @@
; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphminposuw %xmm0, %xmm0
@@ -2016,7 +2050,8 @@
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vphminposuw %xmm0, %xmm0
@@ -2033,7 +2068,8 @@
; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vphminposuw %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -63,16 +63,27 @@
; SSE42-NEXT: movq %xmm2, %rax
; SSE42-NEXT: retq
;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -63,16 +63,27 @@
; SSE42-NEXT: movq %xmm2, %rax
; SSE42-NEXT: retq
;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test_v2i64:
; AVX512BW: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -73,7 +73,7 @@
;
; AVX2-LABEL: var_rotate_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64]
; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
@@ -352,8 +352,9 @@
;
; AVX2-LABEL: var_rotate_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -370,11 +371,12 @@
;
; AVX512F-LABEL: var_rotate_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
@@ -386,11 +388,12 @@
;
; AVX512VL-LABEL: var_rotate_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
@@ -402,9 +405,10 @@
; AVX512BW-LABEL: var_rotate_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -413,9 +417,10 @@
;
; AVX512VLBW-LABEL: var_rotate_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -549,36 +554,66 @@
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: var_rotate_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm3
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm3
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: var_rotate_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_rotate_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $6, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_rotate_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpslld $8, %zmm0, %zmm2
; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm0
@@ -591,7 +626,8 @@
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpslld $8, %zmm0, %zmm2
; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpsrld $8, %zmm0, %zmm0
@@ -601,7 +637,8 @@
;
; AVX512BW-LABEL: var_rotate_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
@@ -614,7 +651,8 @@
;
; AVX512VLBW-LABEL: var_rotate_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
@@ -629,7 +667,8 @@
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -643,7 +682,8 @@
; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VLVBMI2-NEXT: vpermb %ymm0, %ymm2, %ymm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
@@ -720,14 +760,23 @@
; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatvar_rotate_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
-; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_rotate_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_rotate_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64]
+; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_rotate_v2i64:
; AVX512F: # %bb.0:
@@ -1761,14 +1810,25 @@
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatconstant_rotate_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_rotate_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_rotate_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v16i8:
; AVX512F: # %bb.0:
@@ -1783,7 +1843,7 @@
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v16i8:
@@ -1799,7 +1859,7 @@
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v16i8:
@@ -1815,7 +1875,7 @@
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v16i8:
@@ -1972,63 +2032,85 @@
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatconstant_rotate_mask_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $11, %xmm0, %xmm1
-; AVX-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $11, %xmm0, %xmm1
+; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [55,55,55,55,55,55,55,55]
+; AVX2-NEXT: vpsrlw $11, %xmm0, %xmm2
+; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $11, %xmm0, %xmm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [55,55,55,55,55,55,55,55]
+; AVX512F-NEXT: vpsrlw $11, %xmm0, %xmm2
; AVX512F-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $5, %xmm0, %xmm1
-; AVX512VL-NEXT: vpsrlw $11, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [55,55,55,55,55,55,55,55]
+; AVX512VL-NEXT: vpsrlw $11, %xmm0, %xmm2
+; AVX512VL-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $200, %xmm2, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $11, %xmm0, %xmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [55,55,55,55,55,55,55,55]
+; AVX512BW-NEXT: vpsrlw $11, %xmm0, %xmm2
; AVX512BW-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsllw $5, %xmm0, %xmm1
-; AVX512VLBW-NEXT: vpsrlw $11, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [55,55,55,55,55,55,55,55]
+; AVX512VLBW-NEXT: vpsrlw $11, %xmm0, %xmm2
+; AVX512VLBW-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpternlogq $200, %xmm2, %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [55,55,55,55,55,55,55,55]
; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [55,55,55,55,55,55,55,55]
; AVX512VLVBMI2-NEXT: vpshldw $5, %xmm0, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatconstant_rotate_mask_v8i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vprotw $5, %xmm0, %xmm0
-; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i16:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i16:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [55,55,55,55,55,55,55,55]
+; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v8i16:
; X86-SSE2: # %bb.0:
@@ -2058,22 +2140,36 @@
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: splatconstant_rotate_mask_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -2081,8 +2177,9 @@
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v16i8:
@@ -2090,7 +2187,8 @@
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -2098,8 +2196,9 @@
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512VLBW-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i8:
@@ -2107,7 +2206,8 @@
; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512VBMI2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
@@ -2115,15 +2215,23 @@
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512VLVBMI2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatconstant_rotate_mask_v16i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vprotb $4, %xmm0, %xmm0
-; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; XOPAVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v16i8:
; X86-SSE2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -265,7 +265,8 @@
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
@@ -274,7 +275,7 @@
; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5
; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3
@@ -288,11 +289,12 @@
;
; AVX512F-LABEL: var_rotate_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
@@ -302,11 +304,12 @@
;
; AVX512VL-LABEL: var_rotate_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
@@ -317,9 +320,10 @@
; AVX512BW-LABEL: var_rotate_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -327,9 +331,10 @@
;
; AVX512VLBW-LABEL: var_rotate_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -426,21 +431,26 @@
; AVX2-LABEL: var_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
@@ -460,7 +470,8 @@
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
@@ -481,14 +492,16 @@
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vpternlogq $234, %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30,32,32,34,34,36,36,38,38,40,40,42,42,44,44,46,46,48,48,50,50,52,52,54,54,56,56,58,58,60,60,62,62]
@@ -499,7 +512,8 @@
;
; AVX512VLBW-LABEL: var_rotate_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30,32,32,34,34,36,36,38,38,40,40,42,42,44,44,46,46,48,48,50,50,52,52,54,54,56,56,58,58,60,60,62,62]
@@ -513,7 +527,8 @@
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -525,7 +540,8 @@
; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -578,7 +594,7 @@
; AVX2-LABEL: splatvar_rotate_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [64,64]
; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
@@ -1578,9 +1594,11 @@
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -1800,53 +1818,60 @@
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
+; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm2
; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
+; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1
-; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
+; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $200, %ymm2, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $11, %ymm0, %ymm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
+; AVX512BW-NEXT: vpsrlw $11, %ymm0, %ymm2
; AVX512BW-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsllw $5, %ymm0, %ymm1
-; AVX512VLBW-NEXT: vpsrlw $11, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
+; AVX512VLBW-NEXT: vpsrlw $11, %ymm0, %ymm2
+; AVX512VLBW-NEXT: vpsllw $5, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpternlogq $200, %ymm2, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
; AVX512VLVBMI2-NEXT: vpshldw $5, %ymm0, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
@@ -1860,11 +1885,12 @@
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
+; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; XOPAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <16 x i16> %a,
%lshr = lshr <16 x i16> %a,
@@ -1896,11 +1922,14 @@
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v32i8:
@@ -1908,7 +1937,8 @@
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8:
@@ -1916,7 +1946,8 @@
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8:
@@ -1924,7 +1955,8 @@
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8:
@@ -1932,7 +1964,8 @@
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
-; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512VLBW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i8:
@@ -1940,7 +1973,8 @@
; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512VBMI2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i8:
@@ -1948,7 +1982,8 @@
; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512VLVBMI2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
@@ -1966,7 +2001,8 @@
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; XOPAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%shl = shl <32 x i8> %a,
%lshr = lshr <32 x i8> %a,
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -37,12 +37,12 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: var_rotate_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpsrlvd %zmm3, %zmm5, %zmm3
@@ -64,12 +64,12 @@
;
; AVX512VL-LABEL: var_rotate_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm5, %zmm3
@@ -91,9 +91,10 @@
;
; AVX512BW-LABEL: var_rotate_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -101,9 +102,10 @@
;
; AVX512VLBW-LABEL: var_rotate_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
@@ -143,7 +145,7 @@
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
@@ -186,7 +188,7 @@
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm6
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
@@ -210,7 +212,8 @@
;
; AVX512BW-LABEL: var_rotate_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
@@ -225,7 +228,8 @@
;
; AVX512VLBW-LABEL: var_rotate_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
@@ -240,7 +244,8 @@
;
; AVX512VBMI2-LABEL: var_rotate_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
@@ -255,7 +260,8 @@
;
; AVX512VLVBMI2-LABEL: var_rotate_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+;
AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] @@ -833,52 +839,60 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $11, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsrlw $11, %ymm3, %ymm4 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $200, %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm3 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $11, %ymm2, %ymm2 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vpsrlw $11, %ymm3, %ymm4 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vpternlogq $200, %zmm2, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1 -; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; 
AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq $200, %zmm2, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1 -; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm2 +; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $200, %zmm2, %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpbroadcastw {{.*#+}} zmm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] ; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastw {{.*#+}} zmm1 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] ; AVX512VLVBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq %shl = shl <32 x i16> %a, %lshr = lshr <32 x i16> %a, @@ -899,7 +913,9 @@ ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8: @@ -912,7 +928,9 @@ ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8: @@ -920,7 +938,8 @@ ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: 
splatconstant_rotate_mask_v64i8: @@ -928,7 +947,8 @@ ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] +; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v64i8: @@ -936,7 +956,8 @@ ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpbroadcastb {{.*#+}} zmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] +; AVX512VBMI2-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v64i8: @@ -944,7 +965,8 @@ ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 -; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpbroadcastb {{.*#+}} zmm1 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] +; AVX512VLVBMI2-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq %shl = shl <64 x i8> %a, %lshr = lshr <64 x i8> %a, diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -66,7 +66,7 @@ ; ; AVX2-LABEL: var_shift_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -632,14 +632,23 @@ ; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_shift_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_shift_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: 
retq ; ; XOPAVX1-LABEL: splatvar_shift_v2i64: ; XOPAVX1: # %bb.0: @@ -837,7 +846,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -933,15 +942,26 @@ ; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_modulo_shift_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_modulo_shift_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64: ; XOPAVX1: # %bb.0: @@ -954,7 +974,8 @@ ; ; XOPAVX2-LABEL: splatvar_modulo_shift_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1 @@ -964,7 +985,8 @@ ; AVX512-LABEL: splatvar_modulo_shift_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper @@ -972,7 +994,7 @@ ; ; AVX512VL-LABEL: splatvar_modulo_shift_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsraq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; @@ -1130,7 +1152,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1147,7 +1169,8 @@ ; ; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; 
XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 @@ -1156,9 +1179,10 @@ ; ; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper @@ -1166,9 +1190,10 @@ ; ; AVX512BW-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1177,9 +1202,10 @@ ; ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper @@ -1187,9 +1213,10 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper @@ -1601,10 +1628,16 @@ ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX2-NEXT: retq ; -; XOP-LABEL: splatconstant_shift_v2i64: -; XOP: # %bb.0: -; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: splatconstant_shift_v2i64: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatconstant_shift_v2i64: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551609,18446744073709551609] +; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i64: ; AVX512: # %bb.0: @@ -1710,25 +1743,42 @@ ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: 
splatconstant_shift_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq ; -; XOP-LABEL: splatconstant_shift_v16i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; AVX2-LABEL: splatconstant_shift_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatconstant_shift_v16i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatconstant_shift_v16i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253] +; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1736,9 +1786,10 @@ ; AVX512VL-LABEL: splatconstant_shift_v16i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpternlogq $108, %xmm0, %xmm2, %xmm1 +; AVX512VL-NEXT: vpsubb %xmm2, %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v16i8: @@ -1764,15 +1815,25 @@ ; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: PR52719: -; AVX: # %bb.0: -; AVX-NEXT: vmovd %edi, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: PR52719: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %edi, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR52719: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: PR52719: ; XOPAVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -888,7 +888,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -925,7 +925,7 @@ ; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -943,7 +943,7 @@ ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -990,7 +990,7 @@ ; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1022,7 +1022,8 @@ ; ; AVX2-LABEL: splatvar_modulo_shift_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 @@ -1044,7 +1045,8 @@ ; ; XOPAVX2-LABEL: 
splatvar_modulo_shift_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 @@ -1055,14 +1057,15 @@ ; AVX512-LABEL: splatvar_modulo_shift_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_modulo_shift_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1252,7 +1255,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1272,7 +1275,8 @@ ; ; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 @@ -1291,7 +1295,7 @@ ; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1299,9 +1303,10 @@ ; ; AVX512BW-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1310,7 +1315,7 @@ ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; 
AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -1322,9 +1327,10 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq @@ -1358,7 +1364,7 @@ ; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1958,8 +1964,9 @@ ; AVX2-LABEL: splatconstant_shift_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1976,8 +1983,9 @@ ; XOPAVX2-LABEL: splatconstant_shift_v32i8: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; XOPAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -1985,8 +1993,9 @@ ; AVX512-LABEL: splatconstant_shift_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -1994,9 +2003,10 @@ ; AVX512VL-LABEL: splatconstant_shift_v32i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpternlogq $108, %ymm0, %ymm2, %ymm1 +; AVX512VL-NEXT: vpsubb %ymm2, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; X86-AVX1-LABEL: splatconstant_shift_v32i8: @@ -2018,8 +2028,9 @@ ; X86-AVX2-LABEL: splatconstant_shift_v32i8: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -197,7 +197,7 @@ ; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2 @@ -212,7 +212,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -233,7 +233,8 @@ define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: 
splatvar_modulo_shift_v8i64: ; ALL: # %bb.0: -; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; ALL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; ALL-NEXT: vpsraq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %mod = and <8 x i64> %b, @@ -286,7 +287,7 @@ ; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2 @@ -301,7 +302,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -449,9 +450,9 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512DQ-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 @@ -464,9 +465,10 @@ ; AVX512BW-LABEL: splatconstant_shift_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 -; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpternlogq $108, %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm0 ; 
AVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1356,7 +1356,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1494,7 +1494,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1632,7 +1632,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -2312,25 +2312,42 @@ ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq ; -; XOP-LABEL: splatconstant_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; AVX2-LABEL: splatconstant_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatconstant_shift_v8i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatconstant_shift_v8i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253] +; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = 
[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -2338,9 +2355,10 @@ ; AVX512VL-LABEL: splatconstant_shift_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpternlogq $108, %xmm0, %xmm2, %xmm1 +; AVX512VL-NEXT: vpsubb %xmm2, %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v8i8: @@ -2365,25 +2383,42 @@ ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq ; -; XOP-LABEL: splatconstant_shift_v4i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; AVX2-LABEL: splatconstant_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatconstant_shift_v4i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatconstant_shift_v4i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253] +; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -2391,9 +2426,10 @@ ; AVX512VL-LABEL: splatconstant_shift_v4i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; 
AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpternlogq $108, %xmm0, %xmm2, %xmm1
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm1, %xmm0
 ; AVX512VL-NEXT: retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v4i8:
@@ -2418,25 +2454,42 @@
 ; SSE-NEXT: psubb %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatconstant_shift_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
 ;
-; XOP-LABEL: splatconstant_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
+; AVX2-LABEL: splatconstant_shift_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; XOPAVX1-LABEL: splatconstant_shift_v2i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatconstant_shift_v2i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
+; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatconstant_shift_v2i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
@@ -2444,9 +2497,10 @@
 ; AVX512VL-LABEL: splatconstant_shift_v2i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpternlogq $108, %xmm0, %xmm2, %xmm1
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm1, %xmm0
 ; AVX512VL-NEXT: retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v2i8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -416,21 +416,40 @@
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: var_shift_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: var_shift_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; XOP-LABEL: var_shift_v16i8:
 ; XOP: # %bb.0:
@@ -775,27 +794,42 @@
 ; SSE-NEXT: psrlq %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatvar_modulo_shift_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_modulo_shift_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
 ;
-; XOP-LABEL: splatvar_modulo_shift_v2i64:
-; XOP: # %bb.0:
-; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; AVX2-LABEL: splatvar_modulo_shift_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_modulo_shift_v2i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
@@ -950,7 +984,8 @@
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -959,9 +994,10 @@
 ;
 ; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
@@ -969,9 +1005,10 @@
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -980,9 +1017,10 @@
 ;
 ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT: vzeroupper
@@ -990,9 +1028,10 @@
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
@@ -1447,27 +1486,42 @@
 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatconstant_shift_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
 ;
-; XOP-LABEL: splatconstant_shift_v16i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
+; AVX2-LABEL: splatconstant_shift_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; XOPAVX1-LABEL: splatconstant_shift_v16i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatconstant_shift_v16i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatconstant_shift_v16i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatconstant_shift_v16i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -380,16 +380,19 @@
 ;
 ; AVX2-LABEL: var_shift_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: retq
@@ -420,16 +423,19 @@
 ;
 ; AVX512DQ-LABEL: var_shift_v32i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT: retq
@@ -444,16 +450,19 @@
 ;
 ; AVX512DQVL-LABEL: var_shift_v32i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: vpsrlw $2, %ymm0, %ymm2
-; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: vpsrlw $1, %ymm0, %ymm2
-; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: retq
@@ -502,16 +511,19 @@
 ;
 ; X86-AVX2-LABEL: var_shift_v32i8:
 ; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X86-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
-; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
-; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X86-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
-; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X86-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X86-AVX2-NEXT: retl
@@ -832,7 +844,8 @@
 ;
 ; AVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -847,19 +860,21 @@
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
 ;
@@ -1049,7 +1064,8 @@
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -1072,9 +1088,10 @@
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
@@ -1092,9 +1109,10 @@
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BWVL-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT: retq
@@ -1653,7 +1671,8 @@
 ; AVX2-LABEL: splatconstant_shift_v32i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; XOPAVX1-LABEL: splatconstant_shift_v32i8:
@@ -1668,19 +1687,22 @@
 ; XOPAVX2-LABEL: splatconstant_shift_v32i8:
 ; XOPAVX2: # %bb.0:
 ; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; XOPAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatconstant_shift_v32i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatconstant_shift_v32i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
 ;
 ; X86-AVX1-LABEL: splatconstant_shift_v32i8:
@@ -1697,7 +1719,8 @@
 ; X86-AVX2-LABEL: splatconstant_shift_v32i8:
 ; X86-AVX2: # %bb.0:
 ; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: retl
 %shift = lshr <32 x i8> %a,
 ret <32 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -53,18 +53,18 @@
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT: vpsrlw $4, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5
 ; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5
 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpsrlw $2, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
 ; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpsrlw $1, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
 ; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
@@ -86,17 +86,20 @@
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
@@ -188,7 +191,8 @@
 define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; ALL-LABEL: splatvar_modulo_shift_v8i64:
 ; ALL: # %bb.0:
-; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; ALL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
 ; ALL-NEXT: retq
 %mod = and <8 x i64> %b,
@@ -389,17 +393,20 @@
 define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512DQ-LABEL: splatconstant_shift_v64i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm2
 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatconstant_shift_v64i8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 %shift = lshr <64 x i8> %a,
 ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -521,21 +521,40 @@
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: var_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: var_shift_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; XOP-LABEL: var_shift_v8i8:
 ; XOP: # %bb.0:
@@ -671,21 +690,40 @@
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: var_shift_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: var_shift_v4i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v4i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; XOP-LABEL: var_shift_v4i8:
 ; XOP: # %bb.0:
@@ -821,21 +859,40 @@
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: var_shift_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: var_shift_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; XOP-LABEL: var_shift_v2i8:
 ; XOP: # %bb.0:
@@ -2022,27 +2079,42 @@
 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatconstant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
 ;
-; XOP-LABEL: splatconstant_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
+; AVX2-LABEL: splatconstant_shift_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; XOPAVX1-LABEL: splatconstant_shift_v8i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatconstant_shift_v8i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatconstant_shift_v8i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatconstant_shift_v8i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v8i8:
@@ -2061,27 +2133,42 @@
 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatconstant_shift_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v4i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
 ;
-; XOP-LABEL: splatconstant_shift_v4i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
+; AVX2-LABEL: splatconstant_shift_v4i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; XOPAVX1-LABEL: splatconstant_shift_v4i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatconstant_shift_v4i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatconstant_shift_v4i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatconstant_shift_v4i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v4i8:
@@ -2100,27 +2187,42 @@
 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatconstant_shift_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v2i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
 ;
-; XOP-LABEL: splatconstant_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
+; AVX2-LABEL: splatconstant_shift_v2i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; XOPAVX1-LABEL: splatconstant_shift_v2i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatconstant_shift_v2i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatconstant_shift_v2i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatconstant_shift_v2i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v2i8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -331,20 +331,37 @@
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: var_shift_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: var_shift_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shift_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
 ;
 ; XOP-LABEL: var_shift_v16i8:
 ; XOP: # %bb.0:
@@ -682,27 +699,42 @@
 ; SSE-NEXT: psllq %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatvar_modulo_shift_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_modulo_shift_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
 ;
-; XOP-LABEL: splatvar_modulo_shift_v2i64:
-; XOP: # %bb.0:
-; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; AVX2-LABEL: splatvar_modulo_shift_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_modulo_shift_v2i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatvar_modulo_shift_v2i64:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
@@ -856,16 +888,18 @@
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT: retq
 ;
 ; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQ-NEXT: vpslld %xmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT: vzeroupper
@@ -873,9 +907,10 @@
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -884,9 +919,10 @@
 ;
 ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512DQVL-NEXT: vpslld %xmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512DQVL-NEXT: vzeroupper
@@ -894,9 +930,10 @@
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BWVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
@@ -1133,7 +1170,8 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
@@ -1307,27 +1345,42 @@
 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: splatconstant_shift_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatconstant_shift_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
 ;
-; XOP-LABEL: splatconstant_shift_v16i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
+; AVX2-LABEL: splatconstant_shift_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; XOPAVX1-LABEL: splatconstant_shift_v16i8:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatconstant_shift_v16i8:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatconstant_shift_v16i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatconstant_shift_v16i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
 ; X86-SSE-LABEL: splatconstant_shift_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -323,12 +323,14 @@
 ;
 ; AVX2-LABEL: var_shift_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
@@ -356,12 +358,14 @@
 ;
 ; AVX512DQ-LABEL: var_shift_v32i8:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2
@@ -379,12 +383,14 @@
 ;
 ; AVX512DQVL-LABEL: var_shift_v32i8:
 ; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512DQVL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
@@ -433,12 +439,14 @@
 ;
 ; X86-AVX2-LABEL: var_shift_v32i8:
 ; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X86-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
-; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
-; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; X86-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
@@ -757,7 +765,8 @@
 ;
 ; AVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -772,19 +781,21 @@
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatvar_modulo_shift_v4i64:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
 ;
@@ -973,9 +984,10 @@
 ;
 ; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
 ; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2
 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -993,9 +1005,10 @@
 ;
 ; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
@@ -1012,9 +1025,10 @@
 ;
 ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BWVL-NEXT: vpsllw %xmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BWVL-NEXT: retq
@@ -1261,7 +1275,7 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1291,7 +1305,7 @@
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1310,7 +1324,7 @@
 ; AVX512DQVL: # %bb.0:
 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQVL-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX512DQVL-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -1352,7 +1366,7 @@
 ; X86-AVX2: # %bb.0:
 ; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; X86-AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
@@ -1541,7 +1555,8 @@
 ; AVX2-LABEL: splatconstant_shift_v32i8:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; XOPAVX1-LABEL: splatconstant_shift_v32i8:
@@ -1556,19 +1571,22 @@
 ; XOPAVX2-LABEL: splatconstant_shift_v32i8:
 ; XOPAVX2: # %bb.0:
 ; XOPAVX2-NEXT: vpsllw $3, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; XOPAVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; XOPAVX2-NEXT: retq
 ;
 ; AVX512-LABEL: splatconstant_shift_v32i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatconstant_shift_v32i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
 ;
 ; X86-AVX1-LABEL: splatconstant_shift_v32i8:
@@ -1585,7 +1603,8 @@
 ; X86-AVX2-LABEL: splatconstant_shift_v32i8:
 ; X86-AVX2: # %bb.0:
 ; X86-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: retl
 %shift = shl <32 x i8> %a,
 ret <32 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -53,13 +53,13 @@
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
 ; AVX512DQ-NEXT: vpsllw $4, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
 ; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5
 ; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5
 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpsllw $2, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
 ; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
 ; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5
 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
@@ -83,12 +83,14 @@
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 =
[252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} @@ -181,7 +183,8 @@ define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_modulo_shift_v8i64: ; ALL: # %bb.0: -; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; ALL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; ALL-NEXT: vpsllq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %mod = and <8 x i64> %b, @@ -301,7 +304,7 @@ ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1] ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] @@ -323,7 +326,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 @@ -376,17 +379,20 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: 
vpbroadcastb {{.*#+}} zmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -390,20 +390,37 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: var_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: var_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: var_shift_v8i8: ; XOP: # %bb.0: @@ -534,20 +551,37 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: var_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: var_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, 
%xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: var_shift_v4i8: ; XOP: # %bb.0: @@ -678,20 +712,37 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: var_shift_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: var_shift_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shift_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: var_shift_v2i8: ; XOP: # %bb.0: @@ -1461,7 +1512,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -1550,7 +1602,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -1639,7 +1692,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -1809,27 +1863,42 @@ ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq ; -; XOP-LABEL: splatconstant_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; AVX2-LABEL: splatconstant_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatconstant_shift_v8i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatconstant_shift_v8i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v8i8: @@ -1848,27 +1917,42 @@ ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq ; -; XOP-LABEL: splatconstant_shift_v4i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; AVX2-LABEL: splatconstant_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatconstant_shift_v4i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatconstant_shift_v4i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v4i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v4i8: @@ -1887,27 +1971,42 @@ ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq ; -; XOP-LABEL: splatconstant_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; AVX2-LABEL: splatconstant_shift_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: splatconstant_shift_v2i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatconstant_shift_v2i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, 
%xmm0 +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -910,7 +910,7 @@ ; ; AVX2-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -924,10 +924,18 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; XOP-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: -; XOP: # %bb.0: -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14] -; XOP-NEXT: retq +; XOPAVX1-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -2103,7 +2111,7 @@ ; ; AVX2-LABEL: PR12412: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -2117,10 +2125,18 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; XOP-LABEL: PR12412: -; XOP: # %bb.0: # %entry -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14] -; XOP-NEXT: retq +; XOPAVX1-LABEL: PR12412: +; XOPAVX1: # %bb.0: # %entry +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: PR12412: +; XOPAVX2: # %bb.0: # %entry +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq entry: %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> ret <16 x i8> %0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -6840,7 +6840,7 @@ ; ; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: ; 
AVX512VL-FAST-CROSSLANE: # %bb.0: -; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-FAST-CROSSLANE-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-CROSSLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1678,7 +1678,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX2OR512VL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2OR512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2OR512VL-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: @@ -1692,7 +1693,8 @@ ; ; XOPAVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -4527,7 +4529,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX2OR512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX2OR512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2OR512VL-NEXT: retq ; @@ -4539,7 +4542,8 @@ ; ; XOPAVX2-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; XOPAVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -4575,7 +4579,7 @@ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLVBMI-FAST-ALL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -4613,15 +4617,17 @@ ; ; AVX2-LABEL: 
shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u] ; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VLBW-NEXT: retq ; @@ -4644,10 +4650,27 @@ } define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) { -; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; ALL: # %bb.0: -; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2OR512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -4662,7 +4685,8 @@ ; AVX2OR512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX2OR512VL: # %bb.0: ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2OR512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2OR512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2OR512VL-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: @@ -4674,7 +4698,8 @@ ; XOPAVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; XOPAVX2-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; XOPAVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -4841,7 +4866,7 @@ ; ; AVX512VL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -29,7 +29,7 @@ ; ; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: ; SKX: ## %bb.0: -; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SKX-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -763,11 +763,21 @@ ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: constant_fold_pshufb_2: -; AVX: # %bb.0: -; AVX-NEXT: movl $2, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: constant_fold_pshufb_2: +; AVX1: # %bb.0: +; AVX1-NEXT: movl $2, %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_fold_pshufb_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: constant_fold_pshufb_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512F-NEXT: retq %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> , <16 x i8> ) ret <16 x i8> %1 } diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -107,16 +107,27 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
trunc_packus_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -257,17 +268,29 @@ ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i32_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovq %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i32_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i32_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -1115,7 +1138,7 @@ ; ; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1127,7 +1150,7 @@ ; ; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1293,7 +1316,7 @@ ; ; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1306,7 +1329,7 @@ ; ; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -2801,16 +2824,27 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa 
{{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2955,17 +2989,29 @@ ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -113,16 +113,27 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: 
trunc_ssat_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -259,17 +270,29 @@ ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i32_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovlpd %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i32_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i32_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -1139,10 +1162,10 @@ ; ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1151,10 +1174,10 @@ ; ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; 
AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] @@ -1311,10 +1334,10 @@ ; ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1324,10 +1347,10 @@ ; ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2566,16 +2589,27 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2715,17 +2749,29 @@ ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: 
vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -2,10 +2,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,AVX2-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,AVX2-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL @@ -75,15 
+75,27 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -180,16 +192,29 @@ ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i32_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovlpd %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i32_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i32_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -793,9 +818,11 @@ ; ; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] -; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; 
AVX2-SLOW-NEXT: # xmm1 = mem[0,0] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -804,9 +831,11 @@ ; ; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] -; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: # xmm1 = mem[0,0] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] @@ -922,9 +951,11 @@ ; ; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] -; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -934,9 +965,11 @@ ; ; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] -; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: # xmm1 = mem[0,0] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2098,15 +2131,27 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2206,16 +2251,29 @@ ; SSE41-NEXT: pextrw $0, %xmm2, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i8_store: ; AVX512F: # %bb.0: @@ -4314,21 +4372,30 @@ ; SSE41-NEXT: packuswb %xmm0, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v8i16_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v8i16_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i16_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v8i16_v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; 
AVX512VL-LABEL: trunc_usat_v8i16_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; @@ -4381,23 +4448,33 @@ ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v8i16_v8i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v8i16_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i16_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v8i16_v8i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_usat_v8i16_v8i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: retq @@ -4471,7 +4548,8 @@ ; ; AVX2-LABEL: trunc_usat_v16i16_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -4479,7 +4557,8 @@ ; ; AVX512F-LABEL: trunc_usat_v16i16_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -4487,7 +4566,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v16i16_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: 
vzeroupper @@ -4593,7 +4673,7 @@ ; ; AVX2-LABEL: trunc_usat_v32i16_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX2-NEXT: vpminuw (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 @@ -4602,7 +4682,7 @@ ; ; AVX512F-LABEL: trunc_usat_v32i16_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX512F-NEXT: vpminuw (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero @@ -4614,7 +4694,7 @@ ; ; AVX512VL-LABEL: trunc_usat_v32i16_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX512VL-NEXT: vpminuw (%rdi), %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -1768,17 +1768,25 @@ ; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc2x8i16_16i8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: trunc2x8i16_16i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc2x8i16_16i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc2x8i16_16i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1786,7 +1794,7 @@ ; ; AVX512VL-LABEL: trunc2x8i16_16i8: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm1, 
%xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -122,7 +122,7 @@ ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -140,7 +140,7 @@ ; AVX512CDVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] ; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; @@ -150,7 +150,7 @@ ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -323,7 +323,7 @@ ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -341,7 +341,7 @@ ; AVX512CDVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] ; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; @@ -351,7 +351,7 @@ ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -548,7 +548,7 @@ ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -789,7 +789,7 @@ ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -992,23 +992,77 @@ ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: 
testv8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv8i16: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv8i16: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; 
AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i16: ; AVX512VPOPCNTDQ: # %bb.0: @@ -1168,23 +1222,77 @@ ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv8i16u: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv8i16u: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i16u: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv8i16u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv8i16u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, 
%xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i16u: ; AVX512VPOPCNTDQ: # %bb.0: @@ -1330,20 +1438,65 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv16i8: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv16i8: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; 
AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv16i8: ; AVX512VPOPCNTDQ: # %bb.0: @@ -1485,20 +1638,65 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8u: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv16i8u: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i8u: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv16i8u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv16i8u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv16i8u: ; AVX512VPOPCNTDQ: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll 
b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -45,7 +45,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -119,7 +119,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -168,7 +168,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -242,7 +242,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -299,7 +299,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -385,7 +385,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -446,7 +446,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -532,7 +532,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -588,7 +588,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -606,7 +606,7 @@ ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -624,7 +624,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -679,7 +679,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 
-; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -732,7 +732,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -750,7 +750,7 @@ ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -768,7 +768,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -823,7 +823,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -870,7 +870,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; 
AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -885,7 +885,7 @@ ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -900,7 +900,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -915,7 +915,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -930,7 +930,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -962,7 +962,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1006,7 +1006,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1021,7 +1021,7 @@ ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1036,7 +1036,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1051,7 +1051,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1066,7 +1066,7 @@ ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1098,7 +1098,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll @@ -31,7 +31,7 @@ ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -90,7 +90,7 @@ ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -149,7 +149,7 @@ ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 @@ -216,7 +216,7 @@ ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -263,7 +263,7 @@
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
@@ -294,7 +294,7 @@
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -312,7 +312,7 @@
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -359,7 +359,7 @@
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
@@ -390,7 +390,7 @@
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -408,7 +408,7 @@
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -456,7 +456,7 @@
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3
; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
@@ -480,7 +480,7 @@
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -495,7 +495,7 @@
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -511,7 +511,7 @@
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
@@ -548,7 +548,7 @@
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3
; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
@@ -572,7 +572,7 @@
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -587,7 +587,7 @@
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
@@ -603,7 +603,7 @@
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
--- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
@@ -353,15 +353,25 @@
; SSE-NEXT: pcmpgtb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: ugt_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: ugt_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ugt_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%sh1 = lshr <16 x i8> %x,
%sh2 = lshr <16 x i8> %y,
%cmp = icmp ugt <16 x i8> %sh1, %sh2
@@ -380,15 +390,25 @@
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: ult_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: ult_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ult_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
%sh1 = lshr <16 x i8> %x,
%sh2 = lshr <16 x i8> %y,
%cmp = icmp ult <16 x i8> %sh1, %sh2
@@ -407,16 +427,27 @@
; SSE-NEXT: pcmpeqb %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: uge_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: uge_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uge_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%sh1 = lshr <16 x i8> %x,
%sh2 = lshr <16 x i8> %y,
%cmp = icmp uge <16 x i8> %sh1, %sh2
@@ -435,16 +466,27 @@
; SSE-NEXT: pcmpeqb %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: ule_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: ule_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ule_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%sh1 = lshr <16 x i8> %x,
%sh2 = lshr <16 x i8> %y,
%cmp = icmp ule <16 x i8> %sh1, %sh2
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -2529,7 +2529,8 @@
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -2726,13 +2727,15 @@
;
; AVX2-LABEL: splatshuf_zext_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: splatshuf_zext_v16i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512-NEXT: retq
%shuf = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32>
diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -22,12 +22,21 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_7_mask_lshr_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_7_mask_lshr_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_7_mask_lshr_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_7_mask_lshr_1:
; X64-SSE2: # %bb.0:
@@ -36,12 +45,21 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_7_mask_lshr_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_7_mask_lshr_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_7_mask_lshr_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = lshr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -54,11 +72,18 @@
; X86-SSE2-NEXT: psrlw $1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_lshr_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_lshr_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_lshr_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_lshr_1:
; X64-SSE2: # %bb.0:
@@ -66,11 +91,18 @@
; X64-SSE2-NEXT: psrlw $1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_lshr_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_lshr_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_lshr_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = lshr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -82,11 +114,18 @@
; X86-SSE2-NEXT: psrlw $2, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_lshr_2:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_lshr_2:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_lshr_2:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_lshr_2:
; X64-SSE2: # %bb.0:
@@ -94,11 +133,18 @@
; X64-SSE2-NEXT: psrlw $2, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_lshr_2:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_lshr_2:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_lshr_2:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = lshr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -111,12 +157,21 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_lshr_3:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_lshr_3:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_lshr_3:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_lshr_3:
; X64-SSE2: # %bb.0:
@@ -125,12 +180,21 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_lshr_3:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_lshr_3:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_lshr_3:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = lshr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -143,12 +207,21 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_lshr_4:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_lshr_4:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_lshr_4:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_lshr_4:
; X64-SSE2: # %bb.0:
@@ -157,12 +230,21 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_lshr_4:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_lshr_4:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_lshr_4:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = lshr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -175,11 +257,18 @@
; X86-SSE2-NEXT: psrlw $1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_lshr_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_lshr_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_lshr_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_lshr_1:
; X64-SSE2: # %bb.0:
@@ -187,11 +276,18 @@
; X64-SSE2-NEXT: psrlw $1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_lshr_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_lshr_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_lshr_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = lshr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -203,11 +299,18 @@
; X86-SSE2-NEXT: psrlw $4, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_lshr_4:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_lshr_4:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_lshr_4:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_lshr_4:
; X64-SSE2: # %bb.0:
@@ -215,11 +318,18 @@
; X64-SSE2-NEXT: psrlw $4, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_lshr_4:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_lshr_4:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_lshr_4:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = lshr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -231,11 +341,18 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_lshr_5:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_lshr_5:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_lshr_5:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_lshr_5:
; X64-SSE2: # %bb.0:
@@ -243,11 +360,18 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_lshr_5:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_lshr_5:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_lshr_5:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = lshr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -259,11 +383,18 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_lshr_6:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_lshr_6:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_lshr_6:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_lshr_6:
; X64-SSE2: # %bb.0:
@@ -271,11 +402,18 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_lshr_6:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_lshr_6:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_lshr_6:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = lshr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -291,12 +429,21 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_7_mask_ashr_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_7_mask_ashr_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_7_mask_ashr_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_7_mask_ashr_1:
; X64-SSE2: # %bb.0:
@@ -305,12 +452,21 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_7_mask_ashr_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_7_mask_ashr_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_7_mask_ashr_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -323,11 +479,18 @@
; X86-SSE2-NEXT: psrlw $1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_ashr_1:
; X64-SSE2: # %bb.0:
@@ -335,11 +498,18 @@
; X64-SSE2-NEXT: psrlw $1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -351,11 +521,18 @@
; X86-SSE2-NEXT: psrlw $2, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_2:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_2:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_2:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_ashr_2:
; X64-SSE2: # %bb.0:
@@ -363,11 +540,18 @@
; X64-SSE2-NEXT: psrlw $2, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_2:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_2:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_2:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -380,12 +564,21 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_3:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_3:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_3:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_ashr_3:
; X64-SSE2: # %bb.0:
@@ -394,12 +587,21 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_3:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_3:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_3:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -412,12 +614,21 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_4:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_4:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_4:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_ashr_4:
; X64-SSE2: # %bb.0:
@@ -426,12 +637,21 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_ashr_4:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_ashr_4:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_ashr_4:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -447,14 +667,24 @@
; X86-SSE2-NEXT: psubb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_1:
; X64-SSE2: # %bb.0:
@@ -465,14 +695,24 @@
; X64-SSE2-NEXT: psubb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -487,14 +727,24 @@
; X86-SSE2-NEXT: psubb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_4:
; X64-SSE2: # %bb.0:
@@ -505,14 +755,24 @@
; X64-SSE2-NEXT: psubb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_4:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -527,14 +787,24 @@
; X86-SSE2-NEXT: psubb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_5:
; X64-SSE2: # %bb.0:
@@ -545,14 +815,24 @@
; X64-SSE2-NEXT: psubb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_5:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -567,14 +847,24 @@
; X86-SSE2-NEXT: psubb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_6:
; X64-SSE2: # %bb.0:
@@ -585,14 +875,24 @@
; X64-SSE2-NEXT: psubb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_6:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
ret <16 x i8> %t1
@@ -607,11 +907,18 @@
; X86-SSE2-NEXT: paddb %xmm0, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_7_mask_shl_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_7_mask_shl_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_7_mask_shl_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_7_mask_shl_1:
; X64-SSE2: # %bb.0:
@@ -619,11 +926,18 @@
; X64-SSE2-NEXT: paddb %xmm0, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_7_mask_shl_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_7_mask_shl_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_7_mask_shl_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = shl <16 x i8> %t0,
ret <16 x i8> %t1
@@ -635,11 +949,18 @@
; X86-SSE2-NEXT: psllw $4, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_7_mask_shl_4:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_7_mask_shl_4:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_7_mask_shl_4:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_7_mask_shl_4:
; X64-SSE2: # %bb.0:
@@ -647,11 +968,18 @@
; X64-SSE2-NEXT: psllw $4, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_7_mask_shl_4:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_7_mask_shl_4:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_7_mask_shl_4:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = shl <16 x i8> %t0,
ret <16 x i8> %t1
@@ -663,11 +991,18 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_7_mask_shl_5:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsllw $5, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_7_mask_shl_5:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_7_mask_shl_5:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_7_mask_shl_5:
; X64-SSE2: # %bb.0:
@@ -675,11 +1010,18 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_7_mask_shl_5:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsllw $5, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_7_mask_shl_5:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_7_mask_shl_5:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = shl <16 x i8> %t0,
ret <16 x i8> %t1
@@ -691,11 +1033,18 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_7_mask_shl_6:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpsllw $6, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_7_mask_shl_6:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpsllw $6, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_7_mask_shl_6:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpsllw $6, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_7_mask_shl_6:
; X64-SSE2: # %bb.0:
@@ -703,11 +1052,18 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_7_mask_shl_6:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpsllw $6, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_7_mask_shl_6:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsllw $6, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_7_mask_shl_6:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsllw $6, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = shl <16 x i8> %t0,
ret <16 x i8> %t1
@@ -720,11 +1076,18 @@
; X86-SSE2-NEXT: paddb %xmm0, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_shl_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_shl_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_shl_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_shl_1:
; X64-SSE2: # %bb.0:
@@ -732,11 +1095,18 @@
; X64-SSE2-NEXT: paddb %xmm0, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_shl_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_shl_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_shl_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = shl <16 x i8> %t0,
ret <16 x i8> %t1
@@ -748,11 +1118,18 @@
; X86-SSE2-NEXT: psllw $2, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_shl_2:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsllw $2, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_shl_2:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsllw $2, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_shl_2:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsllw $2, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_shl_2:
; X64-SSE2: # %bb.0:
@@ -760,11 +1137,18 @@
; X64-SSE2-NEXT: psllw $2, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_shl_2:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsllw $2, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_shl_2:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsllw $2, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_shl_2:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllw $2, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = shl <16 x i8> %t0,
ret <16 x i8> %t1
@@ -776,11 +1160,18 @@
; X86-SSE2-NEXT: psllw $3, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_shl_3:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsllw $3, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_shl_3:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_shl_3:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsllw $3, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_shl_3:
; X64-SSE2: # %bb.0:
@@ -788,11 +1179,18 @@
; X64-SSE2-NEXT: psllw $3, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_shl_3:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_shl_3:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_shl_3:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllw $3, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = shl <16 x i8> %t0,
ret <16 x i8> %t1
@@ -805,12 +1203,21 @@
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_28_mask_shl_4:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_28_mask_shl_4:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_28_mask_shl_4:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_28_mask_shl_4:
; X64-SSE2: # %bb.0:
@@ -819,12 +1226,21 @@
; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_28_mask_shl_4:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_28_mask_shl_4:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_28_mask_shl_4:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = shl <16 x i8> %t0,
ret <16 x i8> %t1
@@ -837,11 +1253,18 @@
; X86-SSE2-NEXT: paddb %xmm0, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i8_x_16_224_mask_shl_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_shl_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_shl_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_shl_1:
; X64-SSE2: # %bb.0:
@@ -849,11 +1272,18 @@
; X64-SSE2-NEXT: paddb %xmm0, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i8_x_16_224_mask_shl_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_shl_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_shl_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = shl <16 x i8> %t0,
ret <16 x i8> %t1
@@ -872,11 +1302,18 @@
; X86-SSE2-NEXT: psrlw $1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X86-AVX-LABEL: test_128_i16_x_8_127_mask_lshr_1:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_128_i16_x_8_127_mask_lshr_1:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_128_i16_x_8_127_mask_lshr_1:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i16_x_8_127_mask_lshr_1:
; X64-SSE2: # %bb.0:
@@ -884,11 +1321,18 @@
; X64-SSE2-NEXT: psrlw $1, %xmm0
; X64-SSE2-NEXT: retq
;
-; X64-AVX-LABEL: test_128_i16_x_8_127_mask_lshr_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-;
X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_127_mask_lshr_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_127_mask_lshr_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = lshr <8 x i16> %t0, ret <8 x i16> %t1 @@ -901,11 +1345,18 @@ ; X86-SSE2-NEXT: psrlw $3, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_lshr_3: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_lshr_3: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_lshr_3: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_lshr_3: ; X64-SSE2: # %bb.0: @@ -913,11 +1364,18 @@ ; X64-SSE2-NEXT: psrlw $3, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_lshr_3: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_lshr_3: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_lshr_3: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = lshr <8 x i16> %t0, ret <8 x i16> %t1 @@ -929,11 +1387,18 @@ ; X86-SSE2-NEXT: psrlw $4, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_lshr_4: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_lshr_4: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_lshr_4: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_lshr_4: ; X64-SSE2: # %bb.0: @@ -941,11 +1406,18 @@ ; X64-SSE2-NEXT: psrlw $4, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_lshr_4: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_lshr_4: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; 
X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_lshr_4: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = lshr <8 x i16> %t0, ret <8 x i16> %t1 @@ -957,11 +1429,18 @@ ; X86-SSE2-NEXT: psrlw $5, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_lshr_5: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_lshr_5: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_lshr_5: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_lshr_5: ; X64-SSE2: # %bb.0: @@ -969,11 +1448,18 @@ ; X64-SSE2-NEXT: psrlw $5, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_lshr_5: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_lshr_5: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_lshr_5: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = lshr <8 x i16> %t0, ret <8 x i16> %t1 @@ -985,11 +1471,18 @@ ; X86-SSE2-NEXT: psrlw $6, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_lshr_6: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_lshr_6: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_lshr_6: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_lshr_6: ; X64-SSE2: # %bb.0: @@ -997,11 +1490,18 @@ ; X64-SSE2-NEXT: psrlw $6, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_lshr_6: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_lshr_6: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_lshr_6: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = 
[2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = lshr <8 x i16> %t0, ret <8 x i16> %t1 @@ -1014,11 +1514,18 @@ ; X86-SSE2-NEXT: psrlw $1, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_65024_mask_lshr_1: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_65024_mask_lshr_1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_65024_mask_lshr_1: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65024,65024,65024,65024,65024,65024,65024,65024] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_65024_mask_lshr_1: ; X64-SSE2: # %bb.0: @@ -1026,11 +1533,18 @@ ; X64-SSE2-NEXT: psrlw $1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_65024_mask_lshr_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_65024_mask_lshr_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_65024_mask_lshr_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65024,65024,65024,65024,65024,65024,65024,65024] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = lshr <8 x i16> %t0, ret <8 x i16> %t1 @@ -1042,11 +1556,18 @@ ; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_65024_mask_lshr_8: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_65024_mask_lshr_8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_65024_mask_lshr_8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65024,65024,65024,65024,65024,65024,65024,65024] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_65024_mask_lshr_8: ; X64-SSE2: # %bb.0: @@ -1054,11 +1575,18 @@ ; X64-SSE2-NEXT: psrlw $8, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_65024_mask_lshr_8: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_65024_mask_lshr_8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_65024_mask_lshr_8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65024,65024,65024,65024,65024,65024,65024,65024] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq 
%t0 = and <8 x i16> %a0, %t1 = lshr <8 x i16> %t0, ret <8 x i16> %t1 @@ -1121,11 +1649,18 @@ ; X86-SSE2-NEXT: psrlw $1, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_127_mask_ashr_1: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_127_mask_ashr_1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_127_mask_ashr_1: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_127_mask_ashr_1: ; X64-SSE2: # %bb.0: @@ -1133,11 +1668,18 @@ ; X64-SSE2-NEXT: psrlw $1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_127_mask_ashr_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_127_mask_ashr_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_127_mask_ashr_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = ashr <8 x i16> %t0, ret <8 x i16> %t1 @@ -1150,11 +1692,18 @@ ; X86-SSE2-NEXT: psrlw $3, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_ashr_3: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_ashr_3: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_ashr_3: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_ashr_3: ; X64-SSE2: # %bb.0: @@ -1162,11 +1711,18 @@ ; X64-SSE2-NEXT: psrlw $3, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_ashr_3: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_ashr_3: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_ashr_3: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = ashr <8 x i16> %t0, ret <8 x i16> %t1 @@ -1178,11 +1734,18 @@ ; X86-SSE2-NEXT: psrlw $4, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_ashr_4: -; X86-AVX: # 
%bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_ashr_4: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_ashr_4: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_ashr_4: ; X64-SSE2: # %bb.0: @@ -1190,11 +1753,18 @@ ; X64-SSE2-NEXT: psrlw $4, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_ashr_4: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_ashr_4: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_ashr_4: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = ashr <8 x i16> %t0, ret <8 x i16> %t1 @@ -1206,11 +1776,18 @@ ; X86-SSE2-NEXT: psrlw $5, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_ashr_5: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_ashr_5: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_ashr_5: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_ashr_5: ; X64-SSE2: # %bb.0: @@ -1218,11 +1795,18 @@ ; X64-SSE2-NEXT: psrlw $5, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_ashr_5: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_ashr_5: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_ashr_5: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = ashr <8 x i16> %t0, ret <8 x i16> %t1 @@ -1234,11 +1818,18 @@ ; X86-SSE2-NEXT: psrlw $6, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_ashr_6: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_ashr_6: +; 
X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_ashr_6: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_ashr_6: ; X64-SSE2: # %bb.0: @@ -1246,11 +1837,18 @@ ; X64-SSE2-NEXT: psrlw $6, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_ashr_6: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_ashr_6: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_ashr_6: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = ashr <8 x i16> %t0, ret <8 x i16> %t1 @@ -1263,11 +1861,18 @@ ; X86-SSE2-NEXT: psraw $1, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_65024_mask_ashr_1: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsraw $1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_65024_mask_ashr_1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsraw $1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_65024_mask_ashr_1: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65024,65024,65024,65024,65024,65024,65024,65024] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsraw $1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_65024_mask_ashr_1: ; X64-SSE2: # %bb.0: @@ -1275,11 +1880,18 @@ ; X64-SSE2-NEXT: psraw $1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_65024_mask_ashr_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsraw $1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_65024_mask_ashr_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsraw $1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_65024_mask_ashr_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65024,65024,65024,65024,65024,65024,65024,65024] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsraw $1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = ashr <8 x i16> %t0, ret <8 x i16> %t1 @@ -1291,11 +1903,18 @@ ; X86-SSE2-NEXT: psraw $8, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_65024_mask_ashr_8: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsraw $8, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_65024_mask_ashr_8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: 
test_128_i16_x_8_65024_mask_ashr_8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65024,65024,65024,65024,65024,65024,65024,65024] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_65024_mask_ashr_8: ; X64-SSE2: # %bb.0: @@ -1303,11 +1922,18 @@ ; X64-SSE2-NEXT: psraw $8, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_65024_mask_ashr_8: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsraw $8, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_65024_mask_ashr_8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_65024_mask_ashr_8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65024,65024,65024,65024,65024,65024,65024,65024] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = ashr <8 x i16> %t0, ret <8 x i16> %t1 @@ -1370,11 +1996,18 @@ ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_127_mask_shl_1: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_127_mask_shl_1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_127_mask_shl_1: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X64-SSE2: # %bb.0: @@ -1382,11 +2015,18 @@ ; X64-SSE2-NEXT: paddw %xmm0, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_127_mask_shl_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_127_mask_shl_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_127_mask_shl_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, ret <8 x i16> %t1 @@ -1398,11 +2038,18 @@ ; X86-SSE2-NEXT: psllw $8, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_127_mask_shl_8: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsllw $8, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_127_mask_shl_8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_127_mask_shl_8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; 
X86-AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_127_mask_shl_8: ; X64-SSE2: # %bb.0: @@ -1410,11 +2057,18 @@ ; X64-SSE2-NEXT: psllw $8, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_127_mask_shl_8: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsllw $8, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_127_mask_shl_8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_127_mask_shl_8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, ret <8 x i16> %t1 @@ -1475,11 +2129,18 @@ ; X86-SSE2-NEXT: psllw $3, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_shl_3: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_shl_3: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_shl_3: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_shl_3: ; X64-SSE2: # %bb.0: @@ -1487,11 +2148,18 @@ ; X64-SSE2-NEXT: psllw $3, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_shl_3: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_shl_3: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_shl_3: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsllw $3, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, ret <8 x i16> %t1 @@ -1503,11 +2171,18 @@ ; X86-SSE2-NEXT: psllw $4, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_shl_4: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsllw $4, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_shl_4: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_shl_4: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_shl_4: ; X64-SSE2: # %bb.0: @@ -1515,11 +2190,18 @@ ; X64-SSE2-NEXT: psllw $4, %xmm0 ; X64-SSE2-NEXT: retq ; -; 
X64-AVX-LABEL: test_128_i16_x_8_2032_mask_shl_4: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsllw $4, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_shl_4: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_shl_4: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, ret <8 x i16> %t1 @@ -1531,11 +2213,18 @@ ; X86-SSE2-NEXT: psllw $5, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_shl_5: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsllw $5, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_shl_5: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_shl_5: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsllw $5, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_shl_5: ; X64-SSE2: # %bb.0: @@ -1543,11 +2232,18 @@ ; X64-SSE2-NEXT: psllw $5, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_shl_5: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsllw $5, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_shl_5: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_shl_5: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, ret <8 x i16> %t1 @@ -1559,11 +2255,18 @@ ; X86-SSE2-NEXT: psllw $6, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_2032_mask_shl_6: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsllw $6, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_2032_mask_shl_6: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsllw $6, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_2032_mask_shl_6: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsllw $6, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_2032_mask_shl_6: ; X64-SSE2: # %bb.0: @@ -1571,11 +2274,18 @@ ; X64-SSE2-NEXT: psllw $6, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_2032_mask_shl_6: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsllw $6, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; 
X64-AVX1-LABEL: test_128_i16_x_8_2032_mask_shl_6: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsllw $6, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_2032_mask_shl_6: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2032,2032,2032,2032,2032,2032,2032,2032] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsllw $6, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, ret <8 x i16> %t1 @@ -1588,11 +2298,18 @@ ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i16_x_8_65024_mask_shl_1: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i16_x_8_65024_mask_shl_1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i16_x_8_65024_mask_shl_1: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65024,65024,65024,65024,65024,65024,65024,65024] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X64-SSE2: # %bb.0: @@ -1600,11 +2317,18 @@ ; X64-SSE2-NEXT: paddw %xmm0, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i16_x_8_65024_mask_shl_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i16_x_8_65024_mask_shl_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i16_x_8_65024_mask_shl_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [65024,65024,65024,65024,65024,65024,65024,65024] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, ret <8 x i16> %t1 @@ -2680,11 +3404,18 @@ ; X64-SSE2-NEXT: psrlq $1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_2147483647_mask_lshr_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_2147483647_mask_lshr_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_2147483647_mask_lshr_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = lshr <2 x i64> %t0, ret <2 x i64> %t1 @@ -2709,11 +3440,18 @@ ; X64-SSE2-NEXT: psrlq $15, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_15: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $15, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_15: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $15, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_15: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $15, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = lshr <2 x i64> %t0, ret <2 x i64> %t1 @@ -2737,11 +3475,18 @@ ; X64-SSE2-NEXT: psrlq $16, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_16: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $16, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = lshr <2 x i64> %t0, ret <2 x i64> %t1 @@ -2765,11 +3510,18 @@ ; X64-SSE2-NEXT: psrlq $17, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_17: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $17, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_17: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $17, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_17: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $17, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = lshr <2 x i64> %t0, ret <2 x i64> %t1 @@ -2793,11 +3545,18 @@ ; X64-SSE2-NEXT: psrlq $18, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_18: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $18, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_18: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $18, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_18: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $18, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = lshr <2 x i64> %t0, ret <2 x i64> %t1 @@ -2822,11 +3581,18 @@ ; X64-SSE2-NEXT: psrlq $1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_lshr_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_lshr_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $1, %xmm0, 
%xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_lshr_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744065119617024,18446744065119617024] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = lshr <2 x i64> %t0, ret <2 x i64> %t1 @@ -2857,11 +3623,18 @@ ; X64-SSE2-NEXT: psrlq $32, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_lshr_32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_lshr_32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_lshr_32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744065119617024,18446744065119617024] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = lshr <2 x i64> %t0, ret <2 x i64> %t1 @@ -2936,11 +3709,18 @@ ; X64-SSE2-NEXT: psrlq $1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_2147483647_mask_ashr_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_2147483647_mask_ashr_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_2147483647_mask_ashr_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = ashr <2 x i64> %t0, ret <2 x i64> %t1 @@ -2965,11 +3745,18 @@ ; X64-SSE2-NEXT: psrlq $15, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $15, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $15, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $15, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = ashr <2 x i64> %t0, ret <2 x i64> %t1 @@ -2993,11 +3780,18 @@ ; X64-SSE2-NEXT: psrlq $16, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $16, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: 
test_128_i64_x_2_140737488289792_mask_ashr_16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = ashr <2 x i64> %t0, ret <2 x i64> %t1 @@ -3021,11 +3815,18 @@ ; X64-SSE2-NEXT: psrlq $17, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_17: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $17, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_17: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $17, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_17: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $17, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = ashr <2 x i64> %t0, ret <2 x i64> %t1 @@ -3049,11 +3850,18 @@ ; X64-SSE2-NEXT: psrlq $18, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_18: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlq $18, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_18: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlq $18, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_18: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlq $18, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = ashr <2 x i64> %t0, ret <2 x i64> %t1 @@ -3099,7 +3907,8 @@ ; ; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744065119617024,18446744065119617024] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsrad $1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] @@ -3155,7 +3964,8 @@ ; ; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744065119617024,18446744065119617024] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] @@ -3298,11 +4108,18 @@ ; X64-SSE2-NEXT: paddq %xmm0, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: 
retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_2147483647_mask_shl_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
   %t0 = and <2 x i64> %a0, <i64 2147483647, i64 2147483647>
   %t1 = shl <2 x i64> %t0, <i64 1, i64 1>
   ret <2 x i64> %t1
@@ -3333,11 +4150,18 @@
 ; X64-SSE2-NEXT: psllq $32, %xmm0
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX-LABEL: test_128_i64_x_2_2147483647_mask_shl_32:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i64_x_2_2147483647_mask_shl_32:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_2147483647_mask_shl_32:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
   %t0 = and <2 x i64> %a0, <i64 2147483647, i64 2147483647>
   %t1 = shl <2 x i64> %t0, <i64 32, i64 32>
   ret <2 x i64> %t1
@@ -3410,11 +4234,18 @@
 ; X64-SSE2-NEXT: psllq $15, %xmm0
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_shl_15:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsllq $15, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_shl_15:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsllq $15, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_shl_15:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllq $15, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
   %t0 = and <2 x i64> %a0, <i64 140737488289792, i64 140737488289792>
   %t1 = shl <2 x i64> %t0, <i64 15, i64 15>
   ret <2 x i64> %t1
@@ -3438,11 +4269,18 @@
 ; X64-SSE2-NEXT: psllq $16, %xmm0
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_shl_16:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsllq $16, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_shl_16:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsllq $16, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_shl_16:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllq $16, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
   %t0 = and <2 x i64> %a0, <i64 140737488289792, i64 140737488289792>
   %t1 = shl <2 x i64> %t0, <i64 16, i64 16>
   ret <2 x i64> %t1
@@ -3466,11 +4304,18 @@
 ; X64-SSE2-NEXT: psllq $17, %xmm0
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_shl_17:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsllq $17, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_shl_17:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsllq $17, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_shl_17:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllq $17, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
   %t0 = and <2 x i64> %a0, <i64 140737488289792, i64 140737488289792>
   %t1 = shl <2 x i64> %t0, <i64 17, i64 17>
   ret <2 x i64> %t1
@@ -3494,11 +4339,18 @@
 ; X64-SSE2-NEXT: psllq $18, %xmm0
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX-LABEL: test_128_i64_x_2_140737488289792_mask_shl_18:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpsllq $18, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i64_x_2_140737488289792_mask_shl_18:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsllq $18, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_140737488289792_mask_shl_18:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [140737488289792,140737488289792]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllq $18, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
   %t0 = and <2 x i64> %a0, <i64 140737488289792, i64 140737488289792>
   %t1 = shl <2 x i64> %t0, <i64 18, i64 18>
   ret <2 x i64> %t1
@@ -3523,11 +4375,18 @@
 ; X64-SSE2-NEXT: paddq %xmm0, %xmm0
 ; X64-SSE2-NEXT: retq
 ;
-; X64-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744065119617024,18446744065119617024]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
   %t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024>
   %t1 = shl <2 x i64> %t0, <i64 1, i64 1>
   ret <2 x i64> %t1
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -148,10 +148,12 @@
 ; AVX2-LABEL: PR22706:
 ; AVX2: ## %bb.0:
 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
   ret <32 x i8> %tmp
diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll b/llvm/test/CodeGen/X86/vselect-minmax.ll
--- a/llvm/test/CodeGen/X86/vselect-minmax.ll
+++ b/llvm/test/CodeGen/X86/vselect-minmax.ll
@@ -9549,7 +9549,7 @@
 ;
 ; AVX2-LABEL: test181:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9617,7 +9617,7 @@
 ;
 ; AVX2-LABEL: test182:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9685,7 +9685,7 @@
 ;
 ; AVX2-LABEL: test183:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -9753,7 +9753,7 @@
 ;
 ; AVX2-LABEL: test184:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10055,7 +10055,7 @@
 ;
 ; AVX2-LABEL: test189:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10123,7 +10123,7 @@
 ;
 ; AVX2-LABEL: test190:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10191,7 +10191,7 @@
 ;
 ; AVX2-LABEL: test191:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
@@ -10259,7 +10259,7 @@
 ;
 ; AVX2-LABEL: test192:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -583,7 +583,8 @@
 ;
 ; AVX512-LABEL: blend_splat1_mask_cond_v16i16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX512-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
@@ -614,7 +615,8 @@
 ;
 ; AVX512-LABEL: blend_splat1_mask_cond_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-NEXT: vpand %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX512-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
@@ -644,8 +646,7 @@
 ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm0, %k1
+; AVX512F-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
 ; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT: vzeroupper
@@ -653,7 +654,7 @@
 ;
 ; AVX512VL-LABEL: blend_splatmax_mask_cond_v2i64:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; AVX512VL-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1
 ; AVX512VL-NEXT: vpblendmq %xmm1, %xmm2, %xmm0 {%k1}
 ; AVX512VL-NEXT: retq
 ;
@@ -711,7 +712,8 @@
 ;
 ; AVX512-LABEL: blend_splatmax_mask_cond_v8i16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX512-NEXT: vpand %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX512-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
@@ -750,7 +752,8 @@
 ;
 ; AVX512-LABEL: blend_splatmax_mask_cond_v32i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX512-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
@@ -880,7 +883,8 @@
 ;
 ; AVX512-LABEL: blend_splat_mask_cond_v16i16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024]
+; AVX512-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX512-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
@@ -911,7 +915,8 @@
 ;
 ; AVX512-LABEL: blend_splat_mask_cond_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; AVX512-NEXT: vpand %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX512-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0
 ; AVX512-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -655,15 +655,6 @@
 ; SSE-NEXT: andl $1, %eax
 ; SSE-NEXT: shlq $15, %rax
 ; SSE-NEXT: retq
-;
-; AVX-LABEL: vselect_any_extend_vector_inreg_crash:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: andl $1, %eax
-; AVX-NEXT: shlq $15, %rax
-; AVX-NEXT: retq
 0:
   %1 = load <8 x i8>, <8 x i8>* %x
   %2 = icmp eq <8 x i8> %1,
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -522,8 +522,9 @@
 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm0, %xmm4, %xmm0
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
@@ -756,8 +757,9 @@
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpxor %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: interleaved_load_vf32_i8_stride4: