Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2464,6 +2464,30 @@
     Complexity += 2;
   }
 
+  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
+  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
+  // duplicating flag-producing instructions later in the pipeline.
+  if (N.getOpcode() == ISD::ADD) {
+    auto isMathWithFlags = [](SDValue V) {
+      switch (V.getOpcode()) {
+      case X86ISD::ADD:
+      case X86ISD::SUB:
+      case X86ISD::ADC:
+      case X86ISD::SBB:
+      case X86ISD::SMUL:
+      case X86ISD::UMUL:
+      case X86ISD::OR:
+      case X86ISD::XOR:
+      case X86ISD::AND:
+        return true;
+      default:
+        return false;
+      }
+    };
+    if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
+      Complexity++;
+  }
+
   if (AM.Disp)
     Complexity++;
Index: llvm/test/CodeGen/X86/combine-sbb.ll
===================================================================
--- llvm/test/CodeGen/X86/combine-sbb.ll
+++ llvm/test/CodeGen/X86/combine-sbb.ll
@@ -309,35 +309,25 @@
 define i32 @PR40483_sub6(i32*, i32) nounwind {
 ; X86-LABEL: PR40483_sub6:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl (%edx), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: subl %edi, %ecx
+; X86-NEXT: movl (%edx), %ecx
 ; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: subl %edi, %esi
-; X86-NEXT: movl %esi, (%edx)
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, (%edx)
 ; X86-NEXT: jae .LBB8_2
 ; X86-NEXT: # %bb.1:
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: leal (%ecx,%ecx), %eax
 ; X86-NEXT: .LBB8_2:
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: PR40483_sub6:
 ; X64: # %bb.0:
-; X64-NEXT: movl (%rdi), %ecx
-; X64-NEXT: movl %ecx, %edx
-; X64-NEXT: subl %esi, %edx
-; X64-NEXT: addl %edx, %edx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: subl %esi, %ecx
-; X64-NEXT: movl %ecx, (%rdi)
-; X64-NEXT: cmovbl %edx, %eax
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: subl %esi, %eax
+; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: leal (%rax,%rax), %eax
+; X64-NEXT: cmovael %ecx, %eax
 ; X64-NEXT: retq
   %3 = load i32, i32* %0, align 8
   %4 = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 %3, i32 %1)
Index: llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
===================================================================
--- llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -17,10 +17,9 @@
 ; X64-NEXT: seto %r10b
 ; X64-NEXT: movq %rcx, %rax
 ; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %rcx
 ; X64-NEXT: seto %r11b
 ; X64-NEXT: orb %r10b, %r11b
-; X64-NEXT: addq %rsi, %rcx
+; X64-NEXT: leaq (%rsi,%rax), %rcx
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: mulq %r8
 ; X64-NEXT: addq %rcx, %rdx
@@ -55,74 +54,71 @@
 ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: leal (%ecx,%eax), %esi
 ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: addl %ecx, %esi
 ; X86-NEXT: movl %edi, %eax
 ; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ebp
 ; X86-NEXT: movl %edx, %ecx
 ; X86-NEXT: addl %esi, %ecx
 ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: mull %ebx
 ; X86-NEXT: movl %eax, %esi
 ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: addl %edi, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %ebp, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: leal (%esi,%eax), %esi
+; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebx
 ; X86-NEXT: movl %edx, %edi
 ; X86-NEXT: addl %esi, %edi
 ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl %ebp, %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: adcl %ecx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ebp
 ; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %esi
 ; X86-NEXT: addl %ecx, %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: setb %bl
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: mull {{[0-9]+}}(%esp)
 ; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: movzbl %bl, %eax
 ; X86-NEXT: adcl %eax, %ecx
 ; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: xorl %edx, %edx
 ; X86-NEXT: mull %edx
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: adcl %ebp, %edx
 ; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: addl %ebp, %eax
 ; X86-NEXT: adcl %ecx, %edx
 ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT: adcl %edi, %edx
Index: llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
===================================================================
--- llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
+++ llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
@@ -29,10 +29,9 @@
 ; X86-NEXT: seto %cl
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %esi
 ; X86-NEXT: seto %ch
 ; X86-NEXT: orb %cl, %ch
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: leal (%edi,%eax), %esi
 ; X86-NEXT: movl %ebp, %eax
 ; X86-NEXT: mull {{[0-9]+}}(%esp)
 ; X86-NEXT: addl %esi, %edx
Index: llvm/test/CodeGen/X86/vec_umulo.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_umulo.ll
+++ llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2164,7 +2164,6 @@
 ; SSE2-NEXT: pushq %rbp
 ; SSE2-NEXT: pushq %r15
 ; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: pushq %r13
 ; SSE2-NEXT: pushq %r12
 ; SSE2-NEXT: pushq %rbx
 ; SSE2-NEXT: movq %rcx, %rax
@@ -2176,60 +2175,57 @@
 ; SSE2-NEXT: testq %r10, %r10
 ; SSE2-NEXT: setne %dl
 ; SSE2-NEXT: testq %rcx, %rcx
-; SSE2-NEXT: setne %r13b
-; SSE2-NEXT: andb %dl, %r13b
+; SSE2-NEXT: setne %bl
+; SSE2-NEXT: andb %dl, %bl
 ; SSE2-NEXT: mulq %r15
 ; SSE2-NEXT: movq %rax, %rdi
 ; SSE2-NEXT: seto %bpl
 ; SSE2-NEXT: movq %r10, %rax
 ; SSE2-NEXT: mulq %r12
-; SSE2-NEXT: movq %rax, %rbx
 ; SSE2-NEXT: seto %cl
 ; SSE2-NEXT: orb %bpl, %cl
-; SSE2-NEXT: addq %rdi, %rbx
+; SSE2-NEXT: leaq (%rdi,%rax), %rbp
 ; SSE2-NEXT: movq %r12, %rax
 ; SSE2-NEXT: mulq %r15
 ; SSE2-NEXT: movq %rax, %r10
-; SSE2-NEXT: movq %rdx, %r15
-; SSE2-NEXT: addq %rbx, %r15
+; SSE2-NEXT: movq %rdx, %rdi
+; SSE2-NEXT: addq %rbp, %rdi
 ; SSE2-NEXT: setb %al
 ; SSE2-NEXT: orb %cl, %al
-; SSE2-NEXT: orb %r13b, %al
+; SSE2-NEXT: orb %bl, %al
 ; SSE2-NEXT: movzbl %al, %ebp
 ; SSE2-NEXT: testq %r9, %r9
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: testq %rsi, %rsi
-; SSE2-NEXT: setne %r13b
-; SSE2-NEXT: andb %al, %r13b
+; SSE2-NEXT: setne %r12b
+; SSE2-NEXT: andb %al, %r12b
 ; SSE2-NEXT: movq %rsi, %rax
 ; SSE2-NEXT: mulq %r8
 ; SSE2-NEXT: movq %rax, %rsi
-; SSE2-NEXT: seto %r12b
+; SSE2-NEXT: seto %r15b
 ; SSE2-NEXT: movq %r9, %rax
 ; SSE2-NEXT: mulq %r11
-; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: seto %bl
-; SSE2-NEXT: orb %r12b, %bl
-; SSE2-NEXT: addq %rsi, %rdi
+; SSE2-NEXT: seto %cl
+; SSE2-NEXT: orb %r15b, %cl
+; SSE2-NEXT: addq %rax, %rsi
 ; SSE2-NEXT: movq %r11, %rax
 ; SSE2-NEXT: mulq %r8
-; SSE2-NEXT: addq %rdi, %rdx
-; SSE2-NEXT: setb %cl
-; SSE2-NEXT: orb %bl, %cl
-; SSE2-NEXT: orb %r13b, %cl
-; SSE2-NEXT: movzbl %cl, %ecx
+; SSE2-NEXT: addq %rsi, %rdx
+; SSE2-NEXT: setb %bl
+; SSE2-NEXT: orb %cl, %bl
+; SSE2-NEXT: orb %r12b, %bl
+; SSE2-NEXT: movzbl %bl, %ecx
 ; SSE2-NEXT: movd %ecx, %xmm0
 ; SSE2-NEXT: pinsrw $4, %ebp, %xmm0
 ; SSE2-NEXT: movq %r10, 16(%r14)
 ; SSE2-NEXT: movq %rax, (%r14)
-; SSE2-NEXT: movq %r15, 24(%r14)
+; SSE2-NEXT: movq %rdi, 24(%r14)
 ; SSE2-NEXT: movq %rdx, 8(%r14)
 ; SSE2-NEXT: psllq $63, %xmm0
 ; SSE2-NEXT: psrad $31, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT: popq %rbx
 ; SSE2-NEXT: popq %r12
-; SSE2-NEXT: popq %r13
 ; SSE2-NEXT: popq %r14
 ; SSE2-NEXT: popq %r15
 ; SSE2-NEXT: popq %rbp
@@ -2240,7 +2236,6 @@
 ; SSSE3-NEXT: pushq %rbp
 ; SSSE3-NEXT: pushq %r15
 ; SSSE3-NEXT: pushq %r14
-; SSSE3-NEXT: pushq %r13
 ; SSSE3-NEXT: pushq %r12
 ; SSSE3-NEXT: pushq %rbx
 ; SSSE3-NEXT: movq %rcx, %rax
@@ -2252,60 +2247,57 @@
 ; SSSE3-NEXT: testq %r10, %r10
 ; SSSE3-NEXT: setne %dl
 ; SSSE3-NEXT: testq %rcx, %rcx
-; SSSE3-NEXT: setne %r13b
-; SSSE3-NEXT: andb %dl, %r13b
+; SSSE3-NEXT: setne %bl
+; SSSE3-NEXT: andb %dl, %bl
 ; SSSE3-NEXT: mulq %r15
 ; SSSE3-NEXT: movq %rax, %rdi
 ; SSSE3-NEXT: seto %bpl
 ; SSSE3-NEXT: movq %r10, %rax
 ; SSSE3-NEXT: mulq %r12
-; SSSE3-NEXT: movq %rax, %rbx
 ; SSSE3-NEXT: seto %cl
 ; SSSE3-NEXT: orb %bpl, %cl
-; SSSE3-NEXT: addq %rdi, %rbx
+; SSSE3-NEXT: leaq (%rdi,%rax), %rbp
 ; SSSE3-NEXT: movq %r12, %rax
 ; SSSE3-NEXT: mulq %r15
 ; SSSE3-NEXT: movq %rax, %r10
-; SSSE3-NEXT: movq %rdx, %r15
-; SSSE3-NEXT: addq %rbx, %r15
+; SSSE3-NEXT: movq %rdx, %rdi
+; SSSE3-NEXT: addq %rbp, %rdi
 ; SSSE3-NEXT: setb %al
 ; SSSE3-NEXT: orb %cl, %al
-; SSSE3-NEXT: orb %r13b, %al
+; SSSE3-NEXT: orb %bl, %al
 ; SSSE3-NEXT: movzbl %al, %ebp
 ; SSSE3-NEXT: testq %r9, %r9
 ; SSSE3-NEXT: setne %al
 ; SSSE3-NEXT: testq %rsi, %rsi
-; SSSE3-NEXT: setne %r13b
-; SSSE3-NEXT: andb %al, %r13b
+; SSSE3-NEXT: setne %r12b
+; SSSE3-NEXT: andb %al, %r12b
 ; SSSE3-NEXT: movq %rsi, %rax
 ; SSSE3-NEXT: mulq %r8
 ; SSSE3-NEXT: movq %rax, %rsi
-; SSSE3-NEXT: seto %r12b
+; SSSE3-NEXT: seto %r15b
 ; SSSE3-NEXT: movq %r9, %rax
 ; SSSE3-NEXT: mulq %r11
-; SSSE3-NEXT: movq %rax, %rdi
-; SSSE3-NEXT: seto %bl
-; SSSE3-NEXT: orb %r12b, %bl
-; SSSE3-NEXT: addq %rsi, %rdi
+; SSSE3-NEXT: seto %cl
+; SSSE3-NEXT: orb %r15b, %cl
+; SSSE3-NEXT: addq %rax, %rsi
 ; SSSE3-NEXT: movq %r11, %rax
 ; SSSE3-NEXT: mulq %r8
-; SSSE3-NEXT: addq %rdi, %rdx
-; SSSE3-NEXT: setb %cl
-; SSSE3-NEXT: orb %bl, %cl
-; SSSE3-NEXT: orb %r13b, %cl
-; SSSE3-NEXT: movzbl %cl, %ecx
+; SSSE3-NEXT: addq %rsi, %rdx
+; SSSE3-NEXT: setb %bl
+; SSSE3-NEXT: orb %cl, %bl
+; SSSE3-NEXT: orb %r12b, %bl
+; SSSE3-NEXT: movzbl %bl, %ecx
 ; SSSE3-NEXT: movd %ecx, %xmm0
 ; SSSE3-NEXT: pinsrw $4, %ebp, %xmm0
 ; SSSE3-NEXT: movq %r10, 16(%r14)
 ; SSSE3-NEXT: movq %rax, (%r14)
-; SSSE3-NEXT: movq %r15, 24(%r14)
+; SSSE3-NEXT: movq %rdi, 24(%r14)
 ; SSSE3-NEXT: movq %rdx, 8(%r14)
 ; SSSE3-NEXT: psllq $63, %xmm0
 ; SSSE3-NEXT: psrad $31, %xmm0
 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSSE3-NEXT: popq %rbx
 ; SSSE3-NEXT: popq %r12
-; SSSE3-NEXT: popq %r13
 ; SSSE3-NEXT: popq %r14
 ; SSSE3-NEXT: popq %r15
 ; SSSE3-NEXT: popq %rbp
@@ -2316,7 +2308,6 @@
 ; SSE41-NEXT: pushq %rbp
 ; SSE41-NEXT: pushq %r15
 ; SSE41-NEXT: pushq %r14
-; SSE41-NEXT: pushq %r13
 ; SSE41-NEXT: pushq %r12
 ; SSE41-NEXT: pushq %rbx
 ; SSE41-NEXT: movq %rcx, %rax
@@ -2328,60 +2319,57 @@
 ; SSE41-NEXT: testq %r10, %r10
 ; SSE41-NEXT: setne %dl
 ; SSE41-NEXT: testq %rcx, %rcx
-; SSE41-NEXT: setne %r13b
-; SSE41-NEXT: andb %dl, %r13b
+; SSE41-NEXT: setne %bl
+; SSE41-NEXT: andb %dl, %bl
 ; SSE41-NEXT: mulq %r15
 ; SSE41-NEXT: movq %rax, %rdi
 ; SSE41-NEXT: seto %bpl
 ; SSE41-NEXT: movq %r10, %rax
 ; SSE41-NEXT: mulq %r12
-; SSE41-NEXT: movq %rax, %rbx
 ; SSE41-NEXT: seto %cl
 ; SSE41-NEXT: orb %bpl, %cl
-; SSE41-NEXT: addq %rdi, %rbx
+; SSE41-NEXT: leaq (%rdi,%rax), %rbp
 ; SSE41-NEXT: movq %r12, %rax
 ; SSE41-NEXT: mulq %r15
 ; SSE41-NEXT: movq %rax, %r10
-; SSE41-NEXT: movq %rdx, %r15
-; SSE41-NEXT: addq %rbx, %r15
+; SSE41-NEXT: movq %rdx, %rdi
+; SSE41-NEXT: addq %rbp, %rdi
 ; SSE41-NEXT: setb %al
 ; SSE41-NEXT: orb %cl, %al
-; SSE41-NEXT: orb %r13b, %al
+; SSE41-NEXT: orb %bl, %al
 ; SSE41-NEXT: movzbl %al, %ebp
 ; SSE41-NEXT: testq %r9, %r9
 ; SSE41-NEXT: setne %al
 ; SSE41-NEXT: testq %rsi, %rsi
-; SSE41-NEXT: setne %r13b
-; SSE41-NEXT: andb %al, %r13b
+; SSE41-NEXT: setne %r12b
+; SSE41-NEXT: andb %al, %r12b
 ; SSE41-NEXT: movq %rsi, %rax
 ; SSE41-NEXT: mulq %r8
 ; SSE41-NEXT: movq %rax, %rsi
-; SSE41-NEXT: seto %r12b
+; SSE41-NEXT: seto %r15b
 ; SSE41-NEXT: movq %r9, %rax
 ; SSE41-NEXT: mulq %r11
-; SSE41-NEXT: movq %rax, %rdi
-; SSE41-NEXT: seto %bl
-; SSE41-NEXT: orb %r12b, %bl
-; SSE41-NEXT: addq %rsi, %rdi
+; SSE41-NEXT: seto %cl
+; SSE41-NEXT: orb %r15b, %cl
+; SSE41-NEXT: addq %rax, %rsi
 ; SSE41-NEXT: movq %r11, %rax
 ; SSE41-NEXT: mulq %r8
-; SSE41-NEXT: addq %rdi, %rdx
-; SSE41-NEXT: setb %cl
-; SSE41-NEXT: orb %bl, %cl
-; SSE41-NEXT: orb %r13b, %cl
-; SSE41-NEXT: movzbl %cl, %ecx
+; SSE41-NEXT: addq %rsi, %rdx
+; SSE41-NEXT: setb %bl
+; SSE41-NEXT: orb %cl, %bl
+; SSE41-NEXT: orb %r12b, %bl
+; SSE41-NEXT: movzbl %bl, %ecx
 ; SSE41-NEXT: movd %ecx, %xmm0
 ; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
 ; SSE41-NEXT: movq %r10, 16(%r14)
 ; SSE41-NEXT: movq %rax, (%r14)
-; SSE41-NEXT: movq %r15, 24(%r14)
+; SSE41-NEXT: movq %rdi, 24(%r14)
 ; SSE41-NEXT: movq %rdx, 8(%r14)
 ; SSE41-NEXT: psllq $63, %xmm0
 ; SSE41-NEXT: psrad $31, %xmm0
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE41-NEXT: popq %rbx
 ; SSE41-NEXT: popq %r12
-; SSE41-NEXT: popq %r13
 ; SSE41-NEXT: popq %r14
 ; SSE41-NEXT: popq %r15
 ; SSE41-NEXT: popq %rbp
@@ -2392,7 +2380,6 @@
 ; AVX1-NEXT: pushq %rbp
 ; AVX1-NEXT: pushq %r15
 ; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
 ; AVX1-NEXT: pushq %r12
 ; AVX1-NEXT: pushq %rbx
 ; AVX1-NEXT: movq %rcx, %rax
@@ -2404,60 +2391,57 @@
 ; AVX1-NEXT: testq %r10, %r10
 ; AVX1-NEXT: setne %dl
 ; AVX1-NEXT: testq %rcx, %rcx
-; AVX1-NEXT: setne %r13b
-; AVX1-NEXT: andb %dl, %r13b
+; AVX1-NEXT: setne %bl
+; AVX1-NEXT: andb %dl, %bl
 ; AVX1-NEXT: mulq %r15
 ; AVX1-NEXT: movq %rax, %rdi
 ; AVX1-NEXT: seto %bpl
 ; AVX1-NEXT: movq %r10, %rax
 ; AVX1-NEXT: mulq %r12
-; AVX1-NEXT: movq %rax, %rbx
 ; AVX1-NEXT: seto %cl
 ; AVX1-NEXT: orb %bpl, %cl
-; AVX1-NEXT: addq %rdi, %rbx
+; AVX1-NEXT: leaq (%rdi,%rax), %rbp
 ; AVX1-NEXT: movq %r12, %rax
 ; AVX1-NEXT: mulq %r15
 ; AVX1-NEXT: movq %rax, %r10
-; AVX1-NEXT: movq %rdx, %r15
-; AVX1-NEXT: addq %rbx, %r15
+; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: addq %rbp, %rdi
 ; AVX1-NEXT: setb %al
 ; AVX1-NEXT: orb %cl, %al
-; AVX1-NEXT: orb %r13b, %al
+; AVX1-NEXT: orb %bl, %al
 ; AVX1-NEXT: movzbl %al, %ebp
 ; AVX1-NEXT: testq %r9, %r9
 ; AVX1-NEXT: setne %al
 ; AVX1-NEXT: testq %rsi, %rsi
-; AVX1-NEXT: setne %r13b
-; AVX1-NEXT: andb %al, %r13b
+; AVX1-NEXT: setne %r12b
+; AVX1-NEXT: andb %al, %r12b
 ; AVX1-NEXT: movq %rsi, %rax
 ; AVX1-NEXT: mulq %r8
 ; AVX1-NEXT: movq %rax, %rsi
-; AVX1-NEXT: seto %r12b
+; AVX1-NEXT: seto %r15b
 ; AVX1-NEXT: movq %r9, %rax
 ; AVX1-NEXT: mulq %r11
-; AVX1-NEXT: movq %rax, %rdi
 ; AVX1-NEXT: seto %cl
-; AVX1-NEXT: orb %r12b, %cl
-; AVX1-NEXT: addq %rsi, %rdi
+; AVX1-NEXT: orb %r15b, %cl
+; AVX1-NEXT: addq %rax, %rsi
 ; AVX1-NEXT: movq %r11, %rax
 ; AVX1-NEXT: mulq %r8
-; AVX1-NEXT: addq %rdi, %rdx
+; AVX1-NEXT: addq %rsi, %rdx
 ; AVX1-NEXT: setb %bl
 ; AVX1-NEXT: orb %cl, %bl
-; AVX1-NEXT: orb %r13b, %bl
+; AVX1-NEXT: orb %r12b, %bl
 ; AVX1-NEXT: movzbl %bl, %ecx
 ; AVX1-NEXT: vmovd %ecx, %xmm0
 ; AVX1-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
 ; AVX1-NEXT: movq %r10, 16(%r14)
 ; AVX1-NEXT: movq %rax, (%r14)
-; AVX1-NEXT: movq %r15, 24(%r14)
+; AVX1-NEXT: movq %rdi, 24(%r14)
 ; AVX1-NEXT: movq %rdx, 8(%r14)
 ; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: popq %rbx
 ; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
 ; AVX1-NEXT: popq %r14
 ; AVX1-NEXT: popq %r15
 ; AVX1-NEXT: popq %rbp
@@ -2468,7 +2452,6 @@
 ; AVX2-NEXT: pushq %rbp
 ; AVX2-NEXT: pushq %r15
 ; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
 ; AVX2-NEXT: pushq %r12
 ; AVX2-NEXT: pushq %rbx
 ; AVX2-NEXT: movq %rcx, %rax
@@ -2480,60 +2463,57 @@
 ; AVX2-NEXT: testq %r10, %r10
 ; AVX2-NEXT: setne %dl
 ; AVX2-NEXT: testq %rcx, %rcx
-; AVX2-NEXT: setne %r13b
-; AVX2-NEXT: andb %dl, %r13b
+; AVX2-NEXT: setne %bl
+; AVX2-NEXT: andb %dl, %bl
 ; AVX2-NEXT: mulq %r15
 ; AVX2-NEXT: movq %rax, %rdi
 ; AVX2-NEXT: seto %bpl
 ; AVX2-NEXT: movq %r10, %rax
 ; AVX2-NEXT: mulq %r12
-; AVX2-NEXT: movq %rax, %rbx
 ; AVX2-NEXT: seto %cl
 ; AVX2-NEXT: orb %bpl, %cl
-; AVX2-NEXT: addq %rdi, %rbx
+; AVX2-NEXT: leaq (%rdi,%rax), %rbp
 ; AVX2-NEXT: movq %r12, %rax
 ; AVX2-NEXT: mulq %r15
 ; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq %rdx, %r15
-; AVX2-NEXT: addq %rbx, %r15
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: addq %rbp, %rdi
 ; AVX2-NEXT: setb %al
 ; AVX2-NEXT: orb %cl, %al
-; AVX2-NEXT: orb %r13b, %al
+; AVX2-NEXT: orb %bl, %al
 ; AVX2-NEXT: movzbl %al, %ebp
 ; AVX2-NEXT: testq %r9, %r9
 ; AVX2-NEXT: setne %al
 ; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: setne %r13b
-; AVX2-NEXT: andb %al, %r13b
+; AVX2-NEXT: setne %r12b
+; AVX2-NEXT: andb %al, %r12b
 ; AVX2-NEXT: movq %rsi, %rax
 ; AVX2-NEXT: mulq %r8
 ; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: seto %r12b
+; AVX2-NEXT: seto %r15b
 ; AVX2-NEXT: movq %r9, %rax
 ; AVX2-NEXT: mulq %r11
-; AVX2-NEXT: movq %rax, %rdi
 ; AVX2-NEXT: seto %cl
-; AVX2-NEXT: orb %r12b, %cl
-; AVX2-NEXT: addq %rsi, %rdi
+; AVX2-NEXT: orb %r15b, %cl
+; AVX2-NEXT: addq %rax, %rsi
 ; AVX2-NEXT: movq %r11, %rax
 ; AVX2-NEXT: mulq %r8
-; AVX2-NEXT: addq %rdi, %rdx
+; AVX2-NEXT: addq %rsi, %rdx
 ; AVX2-NEXT: setb %bl
 ; AVX2-NEXT: orb %cl, %bl
-; AVX2-NEXT: orb %r13b, %bl
+; AVX2-NEXT: orb %r12b, %bl
 ; AVX2-NEXT: movzbl %bl, %ecx
 ; AVX2-NEXT: vmovd %ecx, %xmm0
 ; AVX2-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
 ; AVX2-NEXT: movq %r10, 16(%r14)
 ; AVX2-NEXT: movq %rax, (%r14)
-; AVX2-NEXT: movq %r15, 24(%r14)
+; AVX2-NEXT: movq %rdi, 24(%r14)
 ; AVX2-NEXT: movq %rdx, 8(%r14)
 ; AVX2-NEXT: vpsllq $63, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: popq %rbx
 ; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
 ; AVX2-NEXT: popq %r14
 ; AVX2-NEXT: popq %r15
 ; AVX2-NEXT: popq %rbp
@@ -2544,7 +2524,6 @@
 ; AVX512-NEXT: pushq %rbp
 ; AVX512-NEXT: pushq %r15
 ; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
 ; AVX512-NEXT: pushq %r12
 ; AVX512-NEXT: pushq %rbx
 ; AVX512-NEXT: movq %rcx, %rax
@@ -2556,25 +2535,24 @@
 ; AVX512-NEXT: testq %r10, %r10
 ; AVX512-NEXT: setne %dl
 ; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: setne %r13b
-; AVX512-NEXT: andb %dl, %r13b
+; AVX512-NEXT: setne %bl
+; AVX512-NEXT: andb %dl, %bl
 ; AVX512-NEXT: mulq %r15
 ; AVX512-NEXT: movq %rax, %rdi
 ; AVX512-NEXT: seto %bpl
 ; AVX512-NEXT: movq %r10, %rax
 ; AVX512-NEXT: mulq %r12
-; AVX512-NEXT: movq %rax, %rbx
 ; AVX512-NEXT: seto %cl
 ; AVX512-NEXT: orb %bpl, %cl
-; AVX512-NEXT: addq %rdi, %rbx
+; AVX512-NEXT: leaq (%rdi,%rax), %rbp
 ; AVX512-NEXT: movq %r12, %rax
 ; AVX512-NEXT: mulq %r15
 ; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq %rdx, %r15
-; AVX512-NEXT: addq %rbx, %r15
+; AVX512-NEXT: movq %rdx, %rdi
+; AVX512-NEXT: addq %rbp, %rdi
 ; AVX512-NEXT: setb %al
 ; AVX512-NEXT: orb %cl, %al
-; AVX512-NEXT: orb %r13b, %al
+; AVX512-NEXT: orb %bl, %al
 ; AVX512-NEXT: kmovd %eax, %k0
 ; AVX512-NEXT: kshiftlw $1, %k0, %k0
 ; AVX512-NEXT: testq %r9, %r9
@@ -2588,13 +2566,12 @@
 ; AVX512-NEXT: seto %bpl
 ; AVX512-NEXT: movq %r9, %rax
 ; AVX512-NEXT: mulq %r11
-; AVX512-NEXT: movq %rax, %rdi
 ; AVX512-NEXT: seto %bl
 ; AVX512-NEXT: orb %bpl, %bl
-; AVX512-NEXT: addq %rsi, %rdi
+; AVX512-NEXT: addq %rax, %rsi
 ; AVX512-NEXT: movq %r11, %rax
 ; AVX512-NEXT: mulq %r8
-; AVX512-NEXT: addq %rdi, %rdx
+; AVX512-NEXT: addq %rsi, %rdx
 ; AVX512-NEXT: setb %sil
 ; AVX512-NEXT: orb %bl, %sil
 ; AVX512-NEXT: orb %cl, %sil
@@ -2603,13 +2580,12 @@
 ; AVX512-NEXT: korw %k0, %k1, %k1
 ; AVX512-NEXT: movq %r10, 16(%r14)
 ; AVX512-NEXT: movq %rax, (%r14)
-; AVX512-NEXT: movq %r15, 24(%r14)
+; AVX512-NEXT: movq %rdi, 24(%r14)
 ; AVX512-NEXT: movq %rdx, 8(%r14)
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT: popq %rbx
 ; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
 ; AVX512-NEXT: popq %r14
 ; AVX512-NEXT: popq %r15
 ; AVX512-NEXT: popq %rbp