diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4798,6 +4798,15 @@
     return ConsiderFlags && (Op->getFlags().hasNoSignedWrap() ||
                              Op->getFlags().hasNoUnsignedWrap());

+  case ISD::SRA:
+  case ISD::SRL:
+    // If the max shift amount isn't in range, then the shift can create poison.
+    if (!getValidMaximumShiftAmountConstant(Op, DemandedElts))
+      return true;
+
+    // Matches hasPoisonGeneratingFlags().
+    return ConsiderFlags && Op->getFlags().hasExact();
+
   default:
     // Allow the target to implement this method for its nodes.
     if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN ||
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -973,16 +973,12 @@
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GCN-IR-NEXT: s_mov_b32 s15, 0
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24
-; GCN-IR-NEXT: s_sext_i32_i16 s7, s7
-; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[4:5], 16
+; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[6:7], 16
+; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 40
 ; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16
 ; GCN-IR-NEXT: s_mov_b32 s3, s2
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40
 ; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31
 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7]
 ; GCN-IR-NEXT: s_mov_b32 s5, s4
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1016,17 +1016,17 @@
 ; GCN-IR-NEXT: s_mov_b32 s13, 0
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 31
-; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31
-; GCN-IR-NEXT: s_ashr_i32 s10, s7, 31
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 31
+; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31
+; GCN-IR-NEXT: s_ashr_i32 s6, s1, 31
 ; GCN-IR-NEXT: s_mov_b32 s1, s0
-; GCN-IR-NEXT: s_mov_b32 s11, s10
+; GCN-IR-NEXT: s_mov_b32 s7, s6
 ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7]
 ; GCN-IR-NEXT: s_sub_u32 s2, s2, s0
 ; GCN-IR-NEXT: s_subb_u32 s3, s3, s0
-; GCN-IR-NEXT: s_sub_u32 s8, s6, s10
-; GCN-IR-NEXT: s_subb_u32 s9, s7, s10
+; GCN-IR-NEXT: s_sub_u32 s8, s8, s6
+; GCN-IR-NEXT: s_subb_u32 s9, s9, s6
 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
@@ -1160,16 +1160,12 @@
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GCN-IR-NEXT: s_mov_b32 s13, 0
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
-; GCN-IR-NEXT: s_sext_i32_i16 s7, s7
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[4:5], 24
-; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 24
-; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[4:5], 16
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 16
+; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[4:5], 16
+; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[6:7], 16
+; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 40
 ; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
 ; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 16
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40
 ; GCN-IR-NEXT: s_mov_b32 s3, s2
 ; GCN-IR-NEXT: s_mov_b32 s11, s10
 ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -792,8 +792,6 @@
 ; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000
 ; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24
 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24
-; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff
-; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff
 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -642,8 +642,8 @@
 ; RV32I-LABEL: zext16_abs8:
 ; RV32I: # %bb.0:
 ; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: srai a0, a0, 24
 ; RV32I-NEXT: srai a2, a0, 31
+; RV32I-NEXT: srai a0, a0, 24
 ; RV32I-NEXT: xor a0, a0, a2
 ; RV32I-NEXT: sub a0, a0, a2
 ; RV32I-NEXT: sh a0, 0(a1)
@@ -660,8 +660,8 @@
 ; RV64I-LABEL: zext16_abs8:
 ; RV64I: # %bb.0:
 ; RV64I-NEXT: slli a0, a0, 56
-; RV64I-NEXT: srai a0, a0, 56
 ; RV64I-NEXT: srai a2, a0, 63
+; RV64I-NEXT: srai a0, a0, 56
 ; RV64I-NEXT: xor a0, a0, a2
 ; RV64I-NEXT: subw a0, a0, a2
 ; RV64I-NEXT: sh a0, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -252,15 +252,17 @@
 ; RV64I-NEXT: .cfi_def_cfa_offset 16
 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT: .cfi_offset ra, -8
-; RV64I-NEXT: srliw a1, a0, 1
+; RV64I-NEXT: srli a1, a0, 1
 ; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 2
+; RV64I-NEXT: srli a1, a0, 2
 ; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 4
+; RV64I-NEXT: srli a1, a0, 4
 ; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 8
+; RV64I-NEXT: slli a1, a0, 33
+; RV64I-NEXT: srli a1, a1, 41
 ; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: srliw a1, a0, 16
+; RV64I-NEXT: slli a1, a0, 33
+; RV64I-NEXT: srli a1, a1, 49
 ; RV64I-NEXT: or a0, a0, a1
 ; RV64I-NEXT: not a0, a0
 ; RV64I-NEXT: srli a1, a0, 1
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -177,142 +177,143 @@
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $156, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: subl $152, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %eax
 ; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %esi
-; X86-NEXT: movl
%esi, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %esi -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: movl %esi, %edx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edi, %ebx -; X86-NEXT: sbbl %edi, %ebp -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: orl %esi, %eax +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: orl %ebp, %eax ; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl %edx, %edi ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: bsrl %esi, %edx +; X86-NEXT: bsrl %ebp, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %ebp, %ebp ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %ebp, %edx +; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bsrl %ebx, %ebx ; X86-NEXT: xorl $31, %ebx ; X86-NEXT: addl $32, %ebx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %ebx ; X86-NEXT: addl $64, %ebx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebp, %edx ; X86-NEXT: cmovnel %ecx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: bsrl %ebp, %ecx +; X86-NEXT: bsrl %esi, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %edi, %edi +; X86-NEXT: movl %edi, %ebp ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %esi -; X86-NEXT: xorl $31, %esi +; X86-NEXT: bsrl %eax, %edi +; X86-NEXT: xorl $31, %edi ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx ; X86-NEXT: testl %eax, %eax -; X86-NEXT: cmovnel %esi, %edx +; X86-NEXT: cmovnel %edi, %edx ; X86-NEXT: addl $64, %edx -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: orl %ebp, %esi ; X86-NEXT: cmovnel %ecx, %edx ; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: subl %edx, %ebx -; X86-NEXT: movl $0, %eax -; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edx, %edx ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: movl $127, %ecx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: cmpl %ebx, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: setb %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: cmovnel %ebp, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: cmovnel %ebp, %edi -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: cmovnel %ebp, %esi +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: cmovnel %ebp, %eax ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.8: # %_udiv-special-cases -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: xorl $127, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl $127, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: je .LBB4_9 ; X86-NEXT: # %bb.5: # %udiv-bb1 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -327,47 +328,44 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl 
$0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: xorb $127, %al ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 148(%esp,%eax), %edx -; X86-NEXT: movl 152(%esp,%eax), %ebx +; X86-NEXT: movsbl %al, %esi +; X86-NEXT: movl 144(%esp,%esi), %edx +; X86-NEXT: movl 148(%esp,%esi), %ebx ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shldl %cl, %edx, %ebx ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 144(%esp,%eax), %ebp -; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl 140(%esp,%esi), %eax +; X86-NEXT: movl %eax, %edi ; X86-NEXT: shrl %edi ; X86-NEXT: shrl %cl, %edi ; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl 140(%esp,%eax), %eax +; X86-NEXT: movl 136(%esp,%esi), %edx ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %eax, %ebp -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl $1, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $0, %eax +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: adcl $0, %edx +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: jae .LBB4_2 ; X86-NEXT: # %bb.6: +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_1: -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: jmp .LBB4_9 ; X86-NEXT: .LBB4_2: # %udiv-preheader ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -382,27 +380,28 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %al, %ch +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movb %dl, %ch ; X86-NEXT: andb $7, %ch -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $15, %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 104(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movb %dl, %cl +; X86-NEXT: shrb $3, %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: movzbl %cl, %edx +; X86-NEXT: movl 100(%esp,%edx), %esi ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl 96(%esp,%edx), %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: movb %ch, %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shrdl %cl, %esi, 
(%esp) # 4-byte Folded Spill -; X86-NEXT: movl 92(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%eax), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: shrdl %cl, %esi, %edx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl 88(%esp,%ebp), %ebp +; X86-NEXT: movl 92(%esp,%eax), %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: notb %cl ; X86-NEXT: addl %ebx, %ebx @@ -410,8 +409,10 @@ ; X86-NEXT: orl %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: shrdl %cl, %eax, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -421,179 +422,173 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB4_3: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %esi +; X86-NEXT: shldl $1, %edx, (%esp) # 4-byte Folded Spill +; X86-NEXT: shldl $1, %ebp, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %edx +; X86-NEXT: shldl $1, %ecx, %ebp ; X86-NEXT: shldl $1, %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $1, %ebp, %edi -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %ebp -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: shldl $1, %ecx, %edi +; X86-NEXT: orl %eax, %edi +; 
X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: addl %esi, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: andl $1, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: subl %ecx, %edx +; X86-NEXT: subl %ecx, %ebp +; X86-NEXT: sbbl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %edi, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: sbbl %edi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: sbbl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: adcl $-1, %esi -; X86-NEXT: adcl $-1, %edx +; X86-NEXT: adcl $-1, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: jne .LBB4_3 ; X86-NEXT: # %bb.4: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit ; X86-NEXT: shldl $1, %edi, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: shldl $1, %ebp, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %ebp -; X86-NEXT: orl %eax, %ebp -; X86-NEXT: addl %edx, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: .LBB4_9: # %udiv-end -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: shldl $1, %eax, %edi +; X86-NEXT: orl %ecx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: subl %eax, %esi +; X86-NEXT: shldl $1, %esi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: addl %esi, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: .LBB4_9: # %udiv-end +; X86-NEXT: xorl %ebp, %ebx +; X86-NEXT: xorl %ebp, %edi +; X86-NEXT: xorl %ebp, %eax +; X86-NEXT: xorl %ebp, %esi +; X86-NEXT: subl %ebp, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl %ebp, 4(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 12(%edx) -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: sbbl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 12(%ecx) ; X86-NEXT: movl %edi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl (%esp), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ebp, %edx ; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %ebx, %edi ; X86-NEXT: setb %bl -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %esi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: imull %eax, %ebx +; X86-NEXT: imull %eax, %edi ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: imull %esi, %ecx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull %edi, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: imull %edx, %ebx +; X86-NEXT: imull %edx, %edi ; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: subl (%esp), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %edi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: addl $156, %esp +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: addl $152, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -452,15 +452,13 @@ ; X86-LABEL: freeze_ashr: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $3, %eax -; X86-NEXT: sarl $3, %eax +; X86-NEXT: sarl $6, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: sarl $3, %eax -; X64-NEXT: sarl $3, %eax +; X64-NEXT: sarl $6, %eax ; X64-NEXT: retq %x = ashr i32 %a0, 3 %y = freeze i32 %x @@ -472,15 +470,13 @@ ; X86-LABEL: freeze_ashr_exact: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $3, %eax -; X86-NEXT: sarl $6, %eax +; X86-NEXT: sarl $9, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: sarl $3, %eax -; X64-NEXT: sarl $6, %eax +; X64-NEXT: sarl $9, %eax ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 %y = freeze i32 %x @@ -507,30 +503,12 @@ define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind { ; X86-LABEL: freeze_ashr_vec: ; X86: # %bb.0: -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psraw $1, %xmm2 -; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; X86-NEXT: movdqa %xmm1, %xmm3 -; X86-NEXT: pandn %xmm2, %xmm3 -; X86-NEXT: psraw $3, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: por %xmm3, %xmm0 -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psraw $3, %xmm2 -; X86-NEXT: psraw $1, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: pandn %xmm2, %xmm1 -; X86-NEXT: por %xmm1, %xmm0 +; X86-NEXT: psraw $4, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_vec: ; X64: # %bb.0: -; 
X64-NEXT: vpsraw $1, %xmm0, %xmm1 -; X64-NEXT: vpsraw $3, %xmm0, %xmm0 -; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; X64-NEXT: vpsraw $3, %xmm0, %xmm1 -; X64-NEXT: vpsraw $1, %xmm0, %xmm0 -; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; X64-NEXT: vpsraw $4, %xmm0, %xmm0 ; X64-NEXT: retq %x = ashr <8 x i16> %a0, %y = freeze <8 x i16> %x @@ -561,15 +539,13 @@ ; X86-LABEL: freeze_lshr: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrl $2, %eax -; X86-NEXT: shrl %eax +; X86-NEXT: shrl $3, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $2, %eax -; X64-NEXT: shrl %eax +; X64-NEXT: shrl $3, %eax ; X64-NEXT: retq %x = lshr i32 %a0, 2 %y = freeze i32 %x @@ -581,15 +557,13 @@ ; X86-LABEL: freeze_lshr_exact: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: shrl $5, %eax +; X86-NEXT: shrl $8, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_exact: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: shrl $5, %eax +; X64-NEXT: shrl $8, %eax ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 %y = freeze i32 %x @@ -616,30 +590,12 @@ define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind { ; X86-LABEL: freeze_lshr_vec: ; X86: # %bb.0: -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psrlw $1, %xmm2 -; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; X86-NEXT: movdqa %xmm1, %xmm3 -; X86-NEXT: pandn %xmm2, %xmm3 -; X86-NEXT: psrlw $2, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: por %xmm3, %xmm0 -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psrlw $2, %xmm2 -; X86-NEXT: psrlw $1, %xmm0 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: pandn %xmm2, %xmm1 -; X86-NEXT: por %xmm1, %xmm0 +; X86-NEXT: psrlw $3, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_vec: ; X64: # %bb.0: -; X64-NEXT: vpsrlw $1, %xmm0, %xmm1 -; X64-NEXT: vpsrlw $2, %xmm0, %xmm0 -; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; X64-NEXT: vpsrlw $2, %xmm0, %xmm1 -; X64-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; X64-NEXT: vpsrlw $3, %xmm0, %xmm0 ; X64-NEXT: retq %x = lshr <8 x i16> %a0, %y = freeze <8 x i16> %x diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -281,13 +281,15 @@ ; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, 
%ymm2 @@ -308,13 +310,15 @@ ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 @@ -358,13 +362,15 @@ ; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm4 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 @@ -385,13 +391,15 @@ ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm4 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 @@ -438,13 +446,15 @@ ; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5 ; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 +; 
AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 @@ -466,13 +476,15 @@ ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 @@ -519,13 +531,15 @@ ; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 @@ -547,13 +561,15 @@ ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: 
vpaddw %ymm3, %ymm2, %ymm2 @@ -601,13 +617,15 @@ ; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5 ; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 @@ -630,13 +648,15 @@ ; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 @@ -679,60 +699,68 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm2 -; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, 
%zmm5, %zmm2 -; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm5 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 +; 
AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm6, %ymm3 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_reg_reg: @@ -761,60 +789,70 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpminub %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpminub %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 ; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpsubb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsubb %ymm1, %ymm4, %ymm4 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vpternlogq $216, %zmm5, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm4 +; AVX512F-NEXT: vpandq %zmm5, %zmm4, %zmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm5, %zmm1 +; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpminub %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm3, 
%ymm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm4, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $216, %zmm5, %zmm1, %zmm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm4 +; AVX512VL-FALLBACK-NEXT: vpandq %zmm5, %zmm4, %zmm4 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm6, %ymm3 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpandnq %zmm1, %zmm5, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_unsigned_reg_reg: @@ -845,62 +883,70 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, 
%ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm1 -; AVX512F-NEXT: vpsubb %ymm0, %ymm5, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm5 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubb %ymm0, %ymm6, %ymm0 +; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: 
vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm6, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm6, %ymm3 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_mem_reg: @@ -931,60 +977,68 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm2 -; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 
-; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm5 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm5, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand 
%ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm6, %ymm3 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -1017,64 +1071,72 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm1 -; AVX512F-NEXT: vpsubb %ymm0, %ymm5, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0 -; 
AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm5, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_mem_mem: