diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4653,6 +4653,15 @@
     return ConsiderFlags && (Op->getFlags().hasNoSignedWrap() ||
                              Op->getFlags().hasNoUnsignedWrap());
 
+  case ISD::SRA:
+  case ISD::SRL:
+    // If the max shift amount isn't in range, then the shift can create poison.
+    if (!getValidMaximumShiftAmountConstant(Op, DemandedElts))
+      return true;
+
+    // Matches hasPoisonGeneratingFlags().
+    return ConsiderFlags && Op->getFlags().hasExact();
+
   default:
     // Allow the target to implement this method for its nodes.
     if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN ||
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -973,16 +973,12 @@
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GCN-IR-NEXT: s_mov_b32 s15, 0
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24
-; GCN-IR-NEXT: s_sext_i32_i16 s7, s7
-; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[4:5], 16
+; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[6:7], 16
+; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 40
 ; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16
 ; GCN-IR-NEXT: s_mov_b32 s3, s2
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40
 ; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31
 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7]
 ; GCN-IR-NEXT: s_mov_b32 s5, s4
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1016,17 +1016,17 @@
 ; GCN-IR-NEXT: s_mov_b32 s13, 0
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 31
-; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31
-; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 31
+; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31
+; GCN-IR-NEXT: s_ashr_i32 s6, s1, 31
 ; GCN-IR-NEXT: s_mov_b32 s1, s0
-; GCN-IR-NEXT: s_mov_b32 s11, s10
+; GCN-IR-NEXT: s_mov_b32 s7, s6
 ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7]
 ; GCN-IR-NEXT: s_sub_u32 s2, s2, s0
 ; GCN-IR-NEXT: s_subb_u32 s3, s3, s0
-; GCN-IR-NEXT: s_sub_u32 s8, s6, s10
-; GCN-IR-NEXT: s_subb_u32 s9, s7, s10
+; GCN-IR-NEXT: s_sub_u32 s8, s8, s6
+; GCN-IR-NEXT: s_subb_u32 s9, s9, s6
 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
@@ -1160,16 +1160,12 @@
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GCN-IR-NEXT: s_mov_b32 s13, 0
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
-; GCN-IR-NEXT: s_sext_i32_i16 s7, s7
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24
-; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24
-; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[4:5], 16
+; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[6:7], 16
+; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 40
 ; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
 ; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40
 ; GCN-IR-NEXT: s_mov_b32 s3, s2
 ; GCN-IR-NEXT: s_mov_b32 s11, s10
 ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -792,8 +792,6 @@
 ; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000
 ; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24
 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24
-; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff
-; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff
 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -252,15 +252,17 @@
 ; RV64I-NEXT: .cfi_def_cfa_offset 16
 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT: .cfi_offset ra, -8
-; RV64I-NEXT: srliw a1, a0, 1
+; RV64I-NEXT: srli a1, a0, 1
 ; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 2
+; RV64I-NEXT: srli a1, a0, 2
 ; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 4
+; RV64I-NEXT: srli a1, a0, 4
 ; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 8
+; RV64I-NEXT: slli a1, a0, 33
+; RV64I-NEXT: srli a1, a1, 41
 ; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 16
+; RV64I-NEXT: slli a1, a0, 33
+; RV64I-NEXT: srli a1, a1, 49
 ; RV64I-NEXT: or a0, a0, a1
 ; RV64I-NEXT: not a0, a0
 ; RV64I-NEXT: srli a1, a0, 1
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -412,15 +412,13 @@
 ; X86-LABEL: freeze_ashr:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl $3, %eax
-; X86-NEXT: sarl $3, %eax
+; X86-NEXT: sarl $6, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: freeze_ashr:
 ; X64: # %bb.0:
 ; X64-NEXT: movl %edi, %eax
-; X64-NEXT: sarl $3, %eax
-; X64-NEXT: sarl $3, %eax
+; X64-NEXT: sarl $6, %eax
 ; X64-NEXT: retq
   %x = ashr i32 %a0, 3
   %y = freeze i32 %x
@@ -432,15 +430,13 @@
 ; X86-LABEL: freeze_ashr_exact:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl $3, %eax
-; X86-NEXT: sarl $6, %eax
+; X86-NEXT: sarl $9, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: freeze_ashr_exact:
 ; X64: # %bb.0:
 ; X64-NEXT: movl %edi, %eax
-; X64-NEXT: sarl $3, %eax
-; X64-NEXT: sarl $6, %eax
+; X64-NEXT: sarl $9, %eax
 ; X64-NEXT: retq
   %x = ashr exact i32 %a0, 3
   %y = freeze i32 %x
@@ -467,30 +463,12 @@
 define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
 ; X86-LABEL: freeze_ashr_vec:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psraw $1, %xmm2
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pandn %xmm2, %xmm3
-; X86-NEXT: psraw $3, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: por %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psraw $3, %xmm2
-; X86-NEXT: psraw $1, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: pandn %xmm2, %xmm1
-; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: psraw $4, %xmm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: freeze_ashr_vec:
 ; X64: # %bb.0:
-; X64-NEXT: vpsraw $1, %xmm0, %xmm1
-; X64-NEXT: vpsraw $3, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT: vpsraw $3, %xmm0, %xmm1
-; X64-NEXT: vpsraw $1, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsraw $4, %xmm0, %xmm0
 ; X64-NEXT: retq
   %x = ashr <8 x i16> %a0, <i16 3, i16 1, i16 3, i16 1, i16 3, i16 1, i16 3, i16 1>
   %y = freeze <8 x i16> %x
@@ -521,15 +499,13 @@
 ; X86-LABEL: freeze_lshr:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $2, %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: shrl $3, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: freeze_lshr:
 ; X64: # %bb.0:
 ; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $2, %eax
-; X64-NEXT: shrl %eax
+; X64-NEXT: shrl $3, %eax
 ; X64-NEXT: retq
   %x = lshr i32 %a0, 2
   %y = freeze i32 %x
@@ -541,15 +517,13 @@
 ; X86-LABEL: freeze_lshr_exact:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: shrl $5, %eax
+; X86-NEXT: shrl $8, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: freeze_lshr_exact:
 ; X64: # %bb.0:
 ; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: shrl $5, %eax
+; X64-NEXT: shrl $8, %eax
 ; X64-NEXT: retq
   %x = lshr exact i32 %a0, 3
   %y = freeze i32 %x
@@ -576,30 +550,12 @@
 define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
 ; X86-LABEL: freeze_lshr_vec:
 ; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psrlw $1, %xmm2
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pandn %xmm2, %xmm3
-; X86-NEXT: psrlw $2, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: por %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psrlw $2, %xmm2
-; X86-NEXT: psrlw $1, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: pandn %xmm2, %xmm1
-; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: psrlw $3, %xmm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: freeze_lshr_vec:
 ; X64: # %bb.0:
-; X64-NEXT: vpsrlw $1, %xmm0, %xmm1
-; X64-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT: vpsrlw $2, %xmm0, %xmm1
-; X64-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsrlw $3, %xmm0, %xmm0
 ; X64-NEXT: retq
   %x = lshr <8 x i16> %a0, <i16 2, i16 1, i16 2, i16 1, i16 2, i16 1, i16 2, i16 1>
   %y = freeze <8 x i16> %x
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -281,19 +281,20 @@
 ; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubw %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpsubw %ymm1, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_reg:
@@ -309,19 +310,20 @@
 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm5, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i16_signed_reg_reg:
@@ -360,19 +362,20 @@
 ; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2
 ; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm4
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vpternlogq $216, %zmm5, %zmm1, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i16_unsigned_reg_reg:
@@ -388,19 +391,20 @@
 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm4
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm4, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $216, %zmm5, %zmm1, %zmm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i16_unsigned_reg_reg:
@@ -442,19 +446,20 @@
 ; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0
 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubw %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vpsubw %ymm0, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg:
@@ -471,19 +476,20 @@
 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm5, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i16_signed_mem_reg:
@@ -525,19 +531,20 @@
 ; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubw %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpsubw %ymm1, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem:
@@ -554,19 +561,20 @@
 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm5, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i16_signed_reg_mem:
@@ -609,19 +617,20 @@
 ; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0
 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubw %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vpsubw %ymm0, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem:
@@ -639,19 +648,20 @@
 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm5, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i16_signed_mem_mem:
@@ -689,60 +699,68 @@
 define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind {
 ; AVX512F-LABEL: vec512_i8_signed_reg_reg:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
 ; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
-; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm6
+; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6
 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm5
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_reg:
 ; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6
 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm6, %ymm3
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i8_signed_reg_reg:
@@ -771,60 +789,70 @@
 define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind {
 ; AVX512F-LABEL: vec512_i8_unsigned_reg_reg:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpminub %ymm2, %ymm3, %ymm4
-; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpminub %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5
 ; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6
 ; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
 ; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmaxub %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3
 ; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpsubb %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsubb %ymm1, %ymm4, %ymm4
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512F-NEXT: vpternlogq $216, %zmm5, %zmm1, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm4
+; AVX512F-NEXT: vpandq %zmm5, %zmm4, %zmm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm5, %zmm1
+; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg:
 ; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpminub %ymm2, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5
 ; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6
 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3
 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm4, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $216, %zmm5, %zmm1, %zmm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm4
+; AVX512VL-FALLBACK-NEXT: vpandq %zmm5, %zmm4, %zmm4
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm6, %ymm3
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpandnq %zmm1, %zmm5, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i8_unsigned_reg_reg:
@@ -855,62 +883,70 @@
 define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind {
 ; AVX512F-LABEL: vec512_i8_signed_mem_reg:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm6
-; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm5
+; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6
+; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vpsubb %ymm0, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm5
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg:
 ; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm6, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm6, %ymm3
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i8_signed_mem_reg:
@@ -941,60 +977,68 @@
 define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind {
 ; AVX512F-LABEL: vec512_i8_signed_reg_mem:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
-; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm6
-; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm5
+; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm5
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
 ; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem:
 ; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm6, %ymm3
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT: retq
@@ -1027,64 +1071,72 @@
 define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; AVX512F-LABEL: vec512_i8_signed_mem_mem:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm2
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm6
-; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm6
+; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vpsubb %ymm0, %ymm5, %ymm5
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem:
 ; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i8_signed_mem_mem:
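
Note on the SelectionDAG change: canCreateUndefOrPoison() now reports that ISD::SRA/ISD::SRL cannot create undef or poison when every demanded shift amount is a constant below the bitwidth, which lets DAGCombine fold freeze(shift x, c) to shift(freeze x, c) and then merge the adjacent constant shifts. The scalar case exercised by freeze-binary.ll above, as IR:

  define i32 @freeze_ashr(i32 %a0) {
    ; 3 < 32, so this ashr cannot create poison; the freeze is hoisted
    ; to %a0 and the two shifts combine into a single ashr by 6.
    %x = ashr i32 %a0, 3
    %y = freeze i32 %x
    %z = ashr i32 %y, 3
    ret i32 %z
  }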
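The `ConsiderFlags && Op->getFlags().hasExact()` clause matches hasPoisonGeneratingFlags(): an exact shift is poison when it shifts out non-zero bits, so the flag itself is poison-generating and has to be dropped when the freeze is hoisted. That is why freeze_ashr_exact above still collapses to a single sarl $9:

  define i32 @freeze_ashr_exact(i32 %a0) {
    ; 'exact' may create poison, so it is dropped when the freeze is
    ; hoisted; the shifts then merge as 3 + 6 = 9.
    %x = ashr exact i32 %a0, 3
    %y = freeze i32 %x
    %z = ashr i32 %y, 6
    ret i32 %z
  }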
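Conversely, the getValidMaximumShiftAmountConstant() guard keeps the conservative answer whenever the shift amount is not provably in range. A hypothetical example (not one of the tests in this patch) where the fold must not fire:

  define i32 @freeze_lshr_var(i32 %a0, i32 %a1) {
    ; %a1 may be >= 32, so this lshr can itself be poison; freeze is
    ; not pushed through it and the two shifts stay separate.
    %x = lshr i32 %a0, %a1
    %y = freeze i32 %x
    %z = lshr i32 %y, 1
    ret i32 %z
  }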