diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6321,17 +6321,20 @@ // MatchRotate - Handle an 'or' of two operands. If this is one of the many // idioms for rotate, and if the target supports rotation instructions, generate -// a rot[lr]. +// a rot[lr]. This also matches funnel shift patterns, similar to rotation but +// with different shifted sources. SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // Must be a legal type. Expanded 'n promoted things won't work with rotates. EVT VT = LHS.getValueType(); if (!TLI.isTypeLegal(VT)) return SDValue(); - // The target must have at least one rotate flavor. + // The target must have at least one rotate/funnel flavor. bool HasROTL = hasOperation(ISD::ROTL, VT); bool HasROTR = hasOperation(ISD::ROTR, VT); - if (!HasROTL && !HasROTR) + bool HasFSHL = hasOperation(ISD::FSHL, VT); + bool HasFSHR = hasOperation(ISD::FSHR, VT); + if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR) return SDValue(); // Check for truncated rotate. @@ -6381,12 +6384,13 @@ // At this point we've matched or extracted a shift op on each side. - if (LHSShift.getOperand(0) != RHSShift.getOperand(0)) - return SDValue(); // Not shifting the same value. - if (LHSShift.getOpcode() == RHSShift.getOpcode()) return SDValue(); // Shifts must disagree. + bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0); + if (!IsRotate && !(HasFSHL || HasFSHR)) + return SDValue(); // Requires funnel shift support. + // Canonicalize shl to left side in a shl/srl pair. if (RHSShift.getOpcode() == ISD::SHL) { std::swap(LHS, RHS); @@ -6402,13 +6406,23 @@ // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) + // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1) + // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2) auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits; }; if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { - SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, - LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); + SDValue Res; + if (IsRotate && (HasROTL || HasROTR)) + Res = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, + LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); + else if (HasFSHL) + Res = DAG.getNode(ISD::FSHL, DL, VT, LHSShiftArg, RHSShiftArg, + LHSShiftAmt); + else + Res = DAG.getNode(ISD::FSHR, DL, VT, LHSShiftArg, RHSShiftArg, + RHSShiftAmt); // If there is an AND of either shifted operand, apply it to the result. if (LHSMask.getNode() || RHSMask.getNode()) { @@ -6426,12 +6440,16 @@ DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits)); } - Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); + Res = DAG.getNode(ISD::AND, DL, VT, Res, Mask); } - return Rot; + return Res; } + // TODO: Handle variable funnel shifts. + if (!IsRotate) + return SDValue(); + // If there is a mask here, and we have a variable shift, we can't be sure // that we're masking out the right stuff. if (LHSMask.getNode() || RHSMask.getNode()) diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1894,178 +1894,223 @@ ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: subq $56, %rsp ; SSE2-NEXT: movaps (%rdi), %xmm1 ; SSE2-NEXT: movaps (%rsi), %xmm0 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: addq %rax, %r9 +; SSE2-NEXT: movq %r9, %rbx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: addq %rdi, %rax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: addq %rcx, %rdi +; SSE2-NEXT: movq %rdi, %r14 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: addq %rbp, %rdi +; SSE2-NEXT: movq %rdi, %r11 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSE2-NEXT: addq %rsi, %r12 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: addq %r11, %rbp -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d -; SSE2-NEXT: addq %r10, %r14 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: addq %r9, %rbx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSE2-NEXT: addq %r8, %r11 +; SSE2-NEXT: addq %rdx, %rsi +; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: addq %r13, %rdx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: addq %r8, %rcx +; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: addq %r10, %rcx +; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: addq %r15, %rcx +; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d -; SSE2-NEXT: addq %rdx, %r10 +; SSE2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: addq %rcx, %r8 +; SSE2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSE2-NEXT: addq %rax, %rdi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: addq %rsi, %rdx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: leaq -1(%r15,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: leaq -1(%r12,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: leaq -1(%r13,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: leaq -1(%rax,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: leaq -1(%rax,%rsi), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: leaq -1(%rax,%rsi), %rax +; SSE2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; SSE2-NEXT: addq $-1, %rbx +; SSE2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movl $0, %ebp +; SSE2-NEXT: adcq $-1, %rbp +; SSE2-NEXT: addq $-1, %rax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi -; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: leaq -1(%rax,%rsi), %rsi -; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: addq $-1, %rbp -; SSE2-NEXT: movl $0, %r9d -; SSE2-NEXT: adcq $-1, %r9 +; SSE2-NEXT: movl $0, %ebx +; SSE2-NEXT: adcq $-1, %rbx ; SSE2-NEXT: addq $-1, %r14 -; SSE2-NEXT: movl $0, %esi -; SSE2-NEXT: adcq $-1, %rsi -; SSE2-NEXT: addq $-1, %rbx +; SSE2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movl $0, %r13d +; SSE2-NEXT: adcq $-1, %r13 +; SSE2-NEXT: addq $-1, %r11 +; SSE2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: adcq $-1, %r11 +; SSE2-NEXT: addq $-1, %r12 +; SSE2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movl $0, %eax ; SSE2-NEXT: adcq $-1, %rax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: addq $-1, %r11 +; SSE2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE2-NEXT: movl $0, %r15d +; SSE2-NEXT: adcq $-1, %r15 +; SSE2-NEXT: addq $-1, %rdx +; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movl $0, %r12d ; SSE2-NEXT: adcq $-1, %r12 +; SSE2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE2-NEXT: movl $0, %r14d +; SSE2-NEXT: adcq $-1, %r14 +; SSE2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, (%rsp) # 8-byte Spill +; SSE2-NEXT: addq $-1, %rcx +; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: addq $-1, %r10 -; SSE2-NEXT: movl $0, %r13d -; SSE2-NEXT: adcq $-1, %r13 +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: adcq $-1, %rsi +; SSE2-NEXT: addq $-1, %r9 +; SSE2-NEXT: movl $0, %edx +; SSE2-NEXT: adcq $-1, %rdx ; SSE2-NEXT: addq $-1, %r8 -; SSE2-NEXT: movl $0, %r15d -; SSE2-NEXT: adcq $-1, %r15 -; SSE2-NEXT: addq $-1, %rdi ; SSE2-NEXT: movl $0, %ecx ; SSE2-NEXT: adcq $-1, %rcx -; SSE2-NEXT: addq $-1, %rdx +; SSE2-NEXT: addq $-1, %rdi ; SSE2-NEXT: movl $0, %eax ; SSE2-NEXT: adcq $-1, %rax -; SSE2-NEXT: shldq $63, %rdx, %rax -; SSE2-NEXT: shldq $63, %rdi, %rcx -; SSE2-NEXT: movq %rcx, %rdx -; SSE2-NEXT: shldq $63, %r8, %r15 -; SSE2-NEXT: shldq $63, %r10, %r13 -; SSE2-NEXT: shldq $63, %r11, %r12 +; SSE2-NEXT: shldq $63, %rdi, %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: shldq $63, %r8, %rcx +; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: shldq $63, %r9, %rdx +; SSE2-NEXT: movq %rdx, %r9 +; SSE2-NEXT: shldq $63, %r10, %rsi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: shldq $63, %rax, %r10 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: movq (%rsp), %r8 # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %r8 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; SSE2-NEXT: shldq $63, %rbx, %rdi -; SSE2-NEXT: shldq $63, %r14, %rsi -; SSE2-NEXT: shldq $63, %rbp, %r9 -; SSE2-NEXT: movq %r9, %xmm8 -; SSE2-NEXT: movq %rsi, %xmm15 +; SSE2-NEXT: shldq $63, %rcx, %rdi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: shldq $63, %rcx, %r14 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: shldq $63, %rax, %r12 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm9 +; SSE2-NEXT: shldq $63, %rcx, %r15 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: shldq $63, %rcx, %rax ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm10 +; SSE2-NEXT: shldq $63, %rcx, %r11 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm4 +; SSE2-NEXT: shldq $63, %rcx, %r13 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm11 +; SSE2-NEXT: shldq $63, %rcx, %rbx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: movq %rcx, %xmm7 -; SSE2-NEXT: movq %rdi, %xmm12 -; SSE2-NEXT: movq %r12, %xmm0 -; SSE2-NEXT: movq %r13, %xmm13 -; SSE2-NEXT: movq %r15, %xmm6 -; SSE2-NEXT: movq %rdx, %xmm14 -; SSE2-NEXT: movq %rax, %xmm5 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm3 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; SSE2-NEXT: shldq $63, %rcx, %rbp +; SSE2-NEXT: movq %rbp, %xmm8 +; SSE2-NEXT: movq %rbx, %xmm0 +; SSE2-NEXT: movq %r13, %xmm9 +; SSE2-NEXT: movq %r11, %xmm2 +; SSE2-NEXT: movq %rax, %xmm10 +; SSE2-NEXT: movq %r15, %xmm4 +; SSE2-NEXT: movq %r12, %xmm11 +; SSE2-NEXT: movq %r14, %xmm7 +; SSE2-NEXT: movq %rdx, %xmm12 +; SSE2-NEXT: movq %rdi, %xmm1 +; SSE2-NEXT: movq %r8, %xmm13 +; SSE2-NEXT: movq %r10, %xmm6 +; SSE2-NEXT: movq %rsi, %xmm14 +; SSE2-NEXT: movq %r9, %xmm5 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 8-byte Folded Reload +; SSE2-NEXT: # xmm15 = mem[0],zero +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 8-byte Folded Reload +; SSE2-NEXT: # xmm3 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm15[0,1,2,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm8 -; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] -; SSE2-NEXT: por %xmm8, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm8 +; SSE2-NEXT: por %xmm0, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] +; SSE2-NEXT: psllq $48, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,2] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: pandn %xmm6, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSE2-NEXT: psllq $48, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; SSE2-NEXT: movups %xmm2, (%rax) +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,0] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE2-NEXT: movupd %xmm1, (%rax) +; SSE2-NEXT: addq $56, %rsp ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -2082,182 +2127,239 @@ ; AVX1-NEXT: pushq %r13 ; AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: vpextrq $1, %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-NEXT: vmovq %xmm6, %r10 -; AVX1-NEXT: vpextrq $1, %xmm6, %r9 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero -; AVX1-NEXT: vmovq %xmm7, %r8 -; AVX1-NEXT: vpextrq $1, %xmm7, %rdi -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm6, %rcx -; AVX1-NEXT: vmovq %xmm6, %r14 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm6, %rax +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-NEXT: vpextrq $1, %xmm6, %rdi ; AVX1-NEXT: vmovq %xmm6, %rbp ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-NEXT: vpextrq $1, %xmm5, %r11 -; AVX1-NEXT: vmovq %xmm5, %r15 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpextrq $1, %xmm5, %rbx +; AVX1-NEXT: vmovq %xmm5, %rsi +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-NEXT: vpextrq $1, %xmm5, %rdx +; AVX1-NEXT: vmovq %xmm5, %rcx +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-NEXT: vpextrq $1, %xmm6, %r13 +; AVX1-NEXT: vmovq %xmm6, %r12 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-NEXT: vpextrq $1, %xmm5, %r10 +; AVX1-NEXT: vmovq %xmm5, %r14 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-NEXT: vpextrq $1, %xmm5, %r9 +; AVX1-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-NEXT: vpextrq $1, %xmm4, %rbx -; AVX1-NEXT: vmovq %xmm4, %rdx -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rsi -; AVX1-NEXT: addq %rcx, %rsi -; AVX1-NEXT: vmovq %xmm0, %r13 -; AVX1-NEXT: addq %r14, %r13 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %r12 -; AVX1-NEXT: addq %rax, %r12 -; AVX1-NEXT: vmovq %xmm0, %r14 -; AVX1-NEXT: addq %rbp, %r14 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero -; AVX1-NEXT: vpextrq $1, %xmm0, %rbp -; AVX1-NEXT: addq %r11, %rbp -; AVX1-NEXT: vmovq %xmm0, %r11 -; AVX1-NEXT: addq %r15, %r11 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-NEXT: vpextrq $1, %xmm0, %r15 -; AVX1-NEXT: addq %rbx, %r15 -; AVX1-NEXT: vmovq %xmm0, %rbx -; AVX1-NEXT: addq %rdx, %rbx -; AVX1-NEXT: vpextrq $1, %xmm6, %rax -; AVX1-NEXT: leaq -1(%rdi,%rax), %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vmovq %xmm6, %rax -; AVX1-NEXT: leaq -1(%r8,%rax), %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-NEXT: vpextrq $1, %xmm5, %rax -; AVX1-NEXT: leaq -1(%r9,%rax), %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq %rdi, %rax +; AVX1-NEXT: movq %rax, %rdi ; AVX1-NEXT: vmovq %xmm5, %rax -; AVX1-NEXT: leaq -1(%r10,%rax), %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq %rbp, %rax +; AVX1-NEXT: movq %rax, %rbp +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vpextrq $1, %xmm4, %r15 +; AVX1-NEXT: addq %rbx, %r15 +; AVX1-NEXT: vmovq %xmm4, %r11 +; AVX1-NEXT: addq %rsi, %r11 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-NEXT: vpextrq $1, %xmm4, %rax -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX1-NEXT: leaq -1(%rcx,%rax), %rax +; AVX1-NEXT: addq %rdx, %rax +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: vmovq %xmm4, %r8 +; AVX1-NEXT: addq %rcx, %r8 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-NEXT: vpextrq $1, %xmm5, %rsi +; AVX1-NEXT: addq %r13, %rsi +; AVX1-NEXT: vmovq %xmm5, %rax +; AVX1-NEXT: addq %r12, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vpextrq $1, %xmm4, %rax +; AVX1-NEXT: addq %r10, %rax +; AVX1-NEXT: movq %rax, %r10 ; AVX1-NEXT: vmovq %xmm4, %rax -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX1-NEXT: leaq -1(%rcx,%rax), %rax +; AVX1-NEXT: addq %r14, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrq $1, %xmm8, %rax -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: leaq -1(%rax,%rcx), %rax +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: addq %r9, %rax +; AVX1-NEXT: movq %rax, %r13 +; AVX1-NEXT: vmovq %xmm2, %rbx +; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vmovq %xmm8, %rax +; AVX1-NEXT: vpextrq $1, %xmm3, %rax +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: addq %rax, %rcx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vmovq %xmm3, %rax ; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: leaq -1(%rax,%rcx), %rax +; AVX1-NEXT: addq %rax, %rcx +; AVX1-NEXT: movq %rcx, %r9 +; AVX1-NEXT: addq $-1, %rdi +; AVX1-NEXT: movq %rdi, (%rsp) # 8-byte Spill +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: adcq $-1, %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %rbp +; AVX1-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: adcq $-1, %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %r15 +; AVX1-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: adcq $-1, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: xorl %r10d, %r10d +; AVX1-NEXT: addq $-1, %r11 +; AVX1-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: adcq $-1, %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %rdx +; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %eax +; AVX1-NEXT: adcq $-1, %rax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %r8 +; AVX1-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %r12d +; AVX1-NEXT: adcq $-1, %r12 ; AVX1-NEXT: addq $-1, %rsi ; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: movl $0, %ecx -; AVX1-NEXT: adcq $-1, %rcx -; AVX1-NEXT: addq $-1, %r13 ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax -; AVX1-NEXT: addq $-1, %r12 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: addq $-1, %rax +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: adcq $-1, %rdx +; AVX1-NEXT: movq %rdx, %rsi +; AVX1-NEXT: addq $-1, %r10 +; AVX1-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %edx +; AVX1-NEXT: adcq $-1, %rdx +; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: movl $0, %r15d +; AVX1-NEXT: adcq $-1, %r15 +; AVX1-NEXT: addq $-1, %r13 +; AVX1-NEXT: movl $0, %r14d +; AVX1-NEXT: adcq $-1, %r14 +; AVX1-NEXT: addq $-1, %rbx +; AVX1-NEXT: movl $0, %r11d +; AVX1-NEXT: adcq $-1, %r11 +; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: movl $0, %r8d +; AVX1-NEXT: adcq $-1, %r8 +; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX1-NEXT: movl $0, %edi ; AVX1-NEXT: adcq $-1, %rdi -; AVX1-NEXT: addq $-1, %r14 -; AVX1-NEXT: movl $0, %esi -; AVX1-NEXT: adcq $-1, %rsi +; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX1-NEXT: movl $0, %r10d +; AVX1-NEXT: adcq $-1, %r10 +; AVX1-NEXT: movq %r9, %rbp ; AVX1-NEXT: addq $-1, %rbp ; AVX1-NEXT: movl $0, %r9d ; AVX1-NEXT: adcq $-1, %r9 -; AVX1-NEXT: addq $-1, %r11 -; AVX1-NEXT: movl $0, %r8d -; AVX1-NEXT: adcq $-1, %r8 -; AVX1-NEXT: addq $-1, %r15 -; AVX1-NEXT: movl $0, %edx -; AVX1-NEXT: adcq $-1, %rdx -; AVX1-NEXT: addq $-1, %rbx -; AVX1-NEXT: adcq $-1, %r10 -; AVX1-NEXT: shldq $63, %r11, %r8 -; AVX1-NEXT: shldq $63, %rbp, %r9 -; AVX1-NEXT: shldq $63, %r14, %rsi -; AVX1-NEXT: shldq $63, %r12, %rdi -; AVX1-NEXT: shldq $63, %r13, %rax -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX1-NEXT: shldq $63, %rbp, %rcx -; AVX1-NEXT: shldq $63, %rbx, %r10 -; AVX1-NEXT: shldq $63, %r15, %rdx -; AVX1-NEXT: vmovq %rcx, %xmm8 -; AVX1-NEXT: vmovq %rax, %xmm9 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm11 -; AVX1-NEXT: vmovq %rdi, %xmm12 -; AVX1-NEXT: vmovq %rsi, %xmm13 -; AVX1-NEXT: vmovq %rdx, %xmm14 -; AVX1-NEXT: vmovq %r10, %xmm15 -; AVX1-NEXT: vmovq %r9, %xmm10 -; AVX1-NEXT: vmovq %r8, %xmm1 +; AVX1-NEXT: shldq $63, %rbx, %r11 +; AVX1-NEXT: shldq $63, %r13, %r14 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbx, %r15 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbx, %rdx +; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: shldq $63, %rax, %rsi +; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: shldq $63, %rax, %rcx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: shldq $63, %rax, %r12 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm4 +; AVX1-NEXT: shldq $63, %rax, %rsi +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm5 +; AVX1-NEXT: shldq $63, %rax, %rdx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm6 +; AVX1-NEXT: shldq $63, %rax, %rcx ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm7 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbx, %rax +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX1-NEXT: movq (%rsp), %r13 # 8-byte Reload +; AVX1-NEXT: shldq $63, %r13, %rbx +; AVX1-NEXT: shldq $63, %rbp, %r9 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbp, %r10 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbp, %rdi +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX1-NEXT: shldq $63, %rbp, %r8 +; AVX1-NEXT: vmovq %rbx, %xmm8 +; AVX1-NEXT: vmovq %rax, %xmm9 +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vmovq %rsi, %xmm12 +; AVX1-NEXT: vmovq %r12, %xmm13 +; AVX1-NEXT: vmovq %r8, %xmm14 +; AVX1-NEXT: vmovq %rdi, %xmm15 +; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 8-byte Folded Reload +; AVX1-NEXT: # xmm10 = mem[0],zero +; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 8-byte Folded Reload +; AVX1-NEXT: # xmm11 = mem[0],zero +; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 8-byte Folded Reload +; AVX1-NEXT: # xmm2 = mem[0],zero +; AVX1-NEXT: vmovq %r15, %xmm3 +; AVX1-NEXT: vmovq %r14, %xmm4 +; AVX1-NEXT: vmovq %r11, %xmm5 +; AVX1-NEXT: vmovq %r10, %xmm6 +; AVX1-NEXT: vmovq %r9, %xmm7 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX1-NEXT: vpsllq $48, %xmm8, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,1,2,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm8[6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-NEXT: vpslld $16, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: addq $8, %rsp ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %r12 ; AVX1-NEXT: popq %r13 @@ -2515,122 +2617,206 @@ ; AVX512F-NEXT: pushq %r13 ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: subq $16, %rsp ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-NEXT: vpextrq $1, %xmm3, %rbx +; AVX512F-NEXT: vmovq %xmm3, %rbp +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-NEXT: vmovq %xmm3, %rdi +; AVX512F-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-NEXT: vmovq %xmm2, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm2, %r9 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm2, %r13 +; AVX512F-NEXT: vpextrq $1, %xmm2, %r11 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512F-NEXT: vmovq %xmm5, %rcx -; AVX512F-NEXT: vpextrq $1, %xmm4, %rax -; AVX512F-NEXT: vmovq %xmm4, %rbx -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-NEXT: vpextrq $1, %xmm4, %rdi -; AVX512F-NEXT: vmovq %xmm4, %rsi -; AVX512F-NEXT: vpextrq $1, %xmm1, %r13 -; AVX512F-NEXT: vmovq %xmm1, %r15 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpextrq $1, %xmm2, %r12 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512F-NEXT: vmovq %xmm2, %r14 -; AVX512F-NEXT: vpextrq $1, %xmm1, %r11 +; AVX512F-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512F-NEXT: vpextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512F-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm1, %r10 -; AVX512F-NEXT: vmovq %xmm1, %r9 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-NEXT: vpextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax +; AVX512F-NEXT: addq %rbx, %rax +; AVX512F-NEXT: movq %rax, %rbx +; AVX512F-NEXT: vmovq %xmm3, %r8 +; AVX512F-NEXT: addq %rbp, %r8 ; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-NEXT: vpextrq $1, %xmm5, %rbp -; AVX512F-NEXT: leal -1(%rdx,%rbp), %edx -; AVX512F-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512F-NEXT: vmovq %xmm5, %rbp -; AVX512F-NEXT: leal -1(%rcx,%rbp), %ecx -; AVX512F-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512F-NEXT: vpextrq $1, %xmm4, %rbp -; AVX512F-NEXT: leal -1(%rax,%rbp), %eax -; AVX512F-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512F-NEXT: vmovq %xmm4, %rbp -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-NEXT: leal -1(%rbx,%rbp), %r8d -; AVX512F-NEXT: vpextrq $1, %xmm4, %rbp -; AVX512F-NEXT: leal -1(%rdi,%rbp), %edi -; AVX512F-NEXT: vmovq %xmm4, %rbp -; AVX512F-NEXT: leal -1(%rsi,%rbp), %esi -; AVX512F-NEXT: vpextrq $1, %xmm3, %rbp -; AVX512F-NEXT: leal -1(%r13,%rbp), %r13d -; AVX512F-NEXT: vmovq %xmm3, %rbp +; AVX512F-NEXT: vmovq %xmm3, %rax +; AVX512F-NEXT: addq %rdi, %rax +; AVX512F-NEXT: movq %rax, %rdi +; AVX512F-NEXT: vpextrq $1, %xmm3, %r12 +; AVX512F-NEXT: addq %rsi, %r12 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-NEXT: leal -1(%r15,%rbp), %r15d -; AVX512F-NEXT: vpextrq $1, %xmm3, %rbp -; AVX512F-NEXT: leal -1(%r12,%rbp), %r12d -; AVX512F-NEXT: vmovq %xmm3, %rbp -; AVX512F-NEXT: leal -1(%r14,%rbp), %r14d +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: addq %rcx, %rax +; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512F-NEXT: leal -1(%r11,%rdx), %r11d -; AVX512F-NEXT: vmovq %xmm2, %rbp -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512F-NEXT: leal -1(%rax,%rbp), %ebp -; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx -; AVX512F-NEXT: leal -1(%r10,%rcx), %ecx +; AVX512F-NEXT: addq %r9, %rdx +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm2, %r10 +; AVX512F-NEXT: addq %r13, %r10 +; AVX512F-NEXT: vpextrq $1, %xmm2, %r15 +; AVX512F-NEXT: addq %r11, %r15 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: addq %r14, %rax +; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm2, %rax -; AVX512F-NEXT: leal -1(%r9,%rax), %eax -; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512F-NEXT: vpextrq $1, %xmm1, %r10 -; AVX512F-NEXT: leal -1(%rdx,%r10), %edx -; AVX512F-NEXT: vmovq %xmm0, %r10 -; AVX512F-NEXT: vmovq %xmm1, %r9 -; AVX512F-NEXT: leaq -1(%r10,%r9), %rbx -; AVX512F-NEXT: shrq %rbx -; AVX512F-NEXT: vmovd %ebx, %xmm0 -; AVX512F-NEXT: shrl %edx -; AVX512F-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %eax -; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %ecx -; AVX512F-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %ebp -; AVX512F-NEXT: vpinsrb $4, %ebp, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %r11d -; AVX512F-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %r14d -; AVX512F-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %r12d +; AVX512F-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX512F-NEXT: movq %rax, %r9 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vmovq %xmm0, %rbp +; AVX512F-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; AVX512F-NEXT: vpextrq $1, %xmm0, %r13 +; AVX512F-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %r11 +; AVX512F-NEXT: addq %rax, %r11 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: addq %rax, %rsi +; AVX512F-NEXT: addq $-1, %rbx +; AVX512F-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movl $0, %eax +; AVX512F-NEXT: adcq $-1, %rax +; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: addq $-1, %r8 +; AVX512F-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movl $0, %eax +; AVX512F-NEXT: adcq $-1, %rax +; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: addq $-1, %rdi +; AVX512F-NEXT: movq %rdi, (%rsp) # 8-byte Spill +; AVX512F-NEXT: movl $0, %eax +; AVX512F-NEXT: adcq $-1, %rax +; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: addq $-1, %r12 +; AVX512F-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movl $0, %eax +; AVX512F-NEXT: adcq $-1, %rax +; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: addq $-1, %rcx +; AVX512F-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movl $0, %eax +; AVX512F-NEXT: adcq $-1, %rax +; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: addq $-1, %rdx +; AVX512F-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movl $0, %eax +; AVX512F-NEXT: adcq $-1, %rax +; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: addq $-1, %r10 +; AVX512F-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movl $0, %ecx +; AVX512F-NEXT: adcq $-1, %rcx +; AVX512F-NEXT: addq $-1, %r15 +; AVX512F-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movl $0, %r12d +; AVX512F-NEXT: adcq $-1, %r12 +; AVX512F-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512F-NEXT: movl $0, %r15d +; AVX512F-NEXT: adcq $-1, %r15 +; AVX512F-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512F-NEXT: movl $0, %r14d +; AVX512F-NEXT: adcq $-1, %r14 +; AVX512F-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512F-NEXT: movl $0, %ebx +; AVX512F-NEXT: adcq $-1, %rbx +; AVX512F-NEXT: movq %r9, %rax +; AVX512F-NEXT: addq $-1, %rax +; AVX512F-NEXT: movl $0, %r10d +; AVX512F-NEXT: adcq $-1, %r10 +; AVX512F-NEXT: addq $-1, %rbp +; AVX512F-NEXT: movl $0, %r9d +; AVX512F-NEXT: adcq $-1, %r9 +; AVX512F-NEXT: addq $-1, %r13 +; AVX512F-NEXT: movl $0, %r8d +; AVX512F-NEXT: adcq $-1, %r8 +; AVX512F-NEXT: addq $-1, %r11 +; AVX512F-NEXT: movl $0, %edi +; AVX512F-NEXT: adcq $-1, %rdi +; AVX512F-NEXT: addq $-1, %rsi +; AVX512F-NEXT: movl $0, %edx +; AVX512F-NEXT: adcq $-1, %rdx +; AVX512F-NEXT: shldq $63, %rsi, %rdx +; AVX512F-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: shldq $63, %r11, %rdi +; AVX512F-NEXT: shldq $63, %r13, %r8 +; AVX512F-NEXT: shldq $63, %rbp, %r9 +; AVX512F-NEXT: shldq $63, %rax, %r10 +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rsi, %rbx +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rsi, %r14 +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rsi, %r15 +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rax, %r12 +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rax, %rcx +; AVX512F-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rax, %rdx +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rax, %rcx +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rax, %r13 +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX512F-NEXT: movq (%rsp), %rax # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rax, %r11 +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rsi, %rax +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; AVX512F-NEXT: shldq $63, %rbp, %rsi +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $2, %r11d, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $3, %r13d, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $6, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload ; AVX512F-NEXT: vpinsrb $7, %r12d, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %r15d ; AVX512F-NEXT: vpinsrb $8, %r15d, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %r13d -; AVX512F-NEXT: vpinsrb $9, %r13d, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %esi -; AVX512F-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %edi -; AVX512F-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0 -; AVX512F-NEXT: shrl %r8d -; AVX512F-NEXT: vpinsrb $12, %r8d, %xmm0, %xmm0 -; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX512F-NEXT: shrl %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX512F-NEXT: shrl %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; AVX512F-NEXT: shrl %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $9, %r14d, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $10, %ebx, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $11, %r10d, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $12, %r9d, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $13, %r8d, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $14, %edi, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload ; AVX512F-NEXT: vmovdqu %xmm0, (%rax) +; AVX512F-NEXT: addq $16, %rsp ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-funnel-shifts.ll b/llvm/test/CodeGen/X86/avx512vbmi2-funnel-shifts.ll --- a/llvm/test/CodeGen/X86/avx512vbmi2-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi2-funnel-shifts.ll @@ -5,16 +5,12 @@ define <8 x i64> @avx512_funnel_shift_q_512(<8 x i64> %a0, <8 x i64> %a1) { ; X86-LABEL: avx512_funnel_shift_q_512: ; X86: # %bb.0: -; X86-NEXT: vpsllvq {{\.LCPI.*}}, %zmm0, %zmm0 -; X86-NEXT: vpsrlvq {{\.LCPI.*}}, %zmm1, %zmm1 -; X86-NEXT: vporq %zmm1, %zmm0, %zmm0 +; X86-NEXT: vpshldvq {{\.LCPI.*}}, %zmm1, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: avx512_funnel_shift_q_512: ; X64: # %bb.0: -; X64-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1 -; X64-NEXT: vporq %zmm1, %zmm0, %zmm0 +; X64-NEXT: vpshldvq {{.*}}(%rip), %zmm1, %zmm0 ; X64-NEXT: retq %1 = shl <8 x i64> %a0, %2 = lshr <8 x i64> %a1, @@ -25,9 +21,7 @@ define <8 x i64> @avx512_funnel_shift_q_512_splat(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: avx512_funnel_shift_q_512_splat: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsllq $31, %zmm0, %zmm0 -; CHECK-NEXT: vpsrlq $33, %zmm1, %zmm1 -; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshldq $31, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shl <8 x i64> %a0, %2 = lshr <8 x i64> %a1, @@ -38,16 +32,12 @@ define <16 x i32> @avx512_funnel_shift_d_512(<16 x i32> %a0, <16 x i32> %a1) { ; X86-LABEL: avx512_funnel_shift_d_512: ; X86: # %bb.0: -; X86-NEXT: vpsllvd {{\.LCPI.*}}, %zmm0, %zmm0 -; X86-NEXT: vpsrlvd {{\.LCPI.*}}, %zmm1, %zmm1 -; X86-NEXT: vpord %zmm1, %zmm0, %zmm0 +; X86-NEXT: vpshldvd {{\.LCPI.*}}, %zmm1, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: avx512_funnel_shift_d_512: ; X64: # %bb.0: -; X64-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1 -; X64-NEXT: vpord %zmm1, %zmm0, %zmm0 +; X64-NEXT: vpshldvd {{.*}}(%rip), %zmm1, %zmm0 ; X64-NEXT: retq %1 = shl <16 x i32> %a0, %2 = lshr <16 x i32> %a1, @@ -58,9 +48,7 @@ define <16 x i32> @avx512_funnel_shift_d_512_splat(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: avx512_funnel_shift_d_512_splat: ; CHECK: # %bb.0: -; CHECK-NEXT: vpslld $15, %zmm0, %zmm0 -; CHECK-NEXT: vpsrld $17, %zmm1, %zmm1 -; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshldd $15, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shl <16 x i32> %a0, %2 = lshr <16 x i32> %a1, @@ -71,16 +59,12 @@ define <32 x i16> @avx512_funnel_shift_w_512(<32 x i16> %a0, <32 x i16> %a1) { ; X86-LABEL: avx512_funnel_shift_w_512: ; X86: # %bb.0: -; X86-NEXT: vpsllvw {{\.LCPI.*}}, %zmm0, %zmm0 -; X86-NEXT: vpsrlvw {{\.LCPI.*}}, %zmm1, %zmm1 -; X86-NEXT: vporq %zmm1, %zmm0, %zmm0 +; X86-NEXT: vpshldvw {{\.LCPI.*}}, %zmm1, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: avx512_funnel_shift_w_512: ; X64: # %bb.0: -; X64-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1 -; X64-NEXT: vporq %zmm1, %zmm0, %zmm0 +; X64-NEXT: vpshldvw {{.*}}(%rip), %zmm1, %zmm0 ; X64-NEXT: retq %1 = shl <32 x i16> %a0, %2 = lshr <32 x i16> %a1, @@ -91,9 +75,7 @@ define <32 x i16> @avx512_funnel_shift_w_512_splat(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: avx512_funnel_shift_w_512_splat: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0 -; CHECK-NEXT: vpsrlw $9, %zmm1, %zmm1 -; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshldw $7, %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shl <32 x i16> %a0, %2 = lshr <32 x i16> %a1, diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-funnel-shifts.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-funnel-shifts.ll --- a/llvm/test/CodeGen/X86/avx512vbmi2vl-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-funnel-shifts.ll @@ -5,16 +5,12 @@ define <2 x i64> @avx512_funnel_shift_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; X86-LABEL: avx512_funnel_shift_q_128: ; X86: # %bb.0: -; X86-NEXT: vpsllvq {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-NEXT: vpsrlvq {{\.LCPI.*}}, %xmm1, %xmm1 -; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshldvq {{\.LCPI.*}}, %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: avx512_funnel_shift_q_128: ; X64: # %bb.0: -; X64-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1 -; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpshldvq {{.*}}(%rip), %xmm1, %xmm0 ; X64-NEXT: retq %1 = shl <2 x i64> %a0, %2 = lshr <2 x i64> %a1, @@ -25,16 +21,12 @@ define <4 x i64> @avx512_funnel_shift_q_256(<4 x i64> %a0, <4 x i64> %a1) { ; X86-LABEL: avx512_funnel_shift_q_256: ; X86: # %bb.0: -; X86-NEXT: vpsllvq {{\.LCPI.*}}, %ymm0, %ymm0 -; X86-NEXT: vpsrlvq {{\.LCPI.*}}, %ymm1, %ymm1 -; X86-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X86-NEXT: vpshldvq {{\.LCPI.*}}, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: avx512_funnel_shift_q_256: ; X64: # %bb.0: -; X64-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 -; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpshldvq {{.*}}(%rip), %ymm1, %ymm0 ; X64-NEXT: retq %1 = shl <4 x i64> %a0, %2 = lshr <4 x i64> %a1, @@ -45,9 +37,7 @@ define <2 x i64> @avx512_funnel_shift_q_128_splat(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: avx512_funnel_shift_q_128_splat: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsllq $31, %xmm0, %xmm0 -; CHECK-NEXT: vpsrlq $33, %xmm1, %xmm1 -; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshldq $31, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shl <2 x i64> %a0, %2 = lshr <2 x i64> %a1, @@ -58,9 +48,7 @@ define <4 x i64> @avx512_funnel_shift_q_256_splat(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: avx512_funnel_shift_q_256_splat: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsllq $31, %ymm0, %ymm0 -; CHECK-NEXT: vpsrlq $33, %ymm1, %ymm1 -; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshldq $31, %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shl <4 x i64> %a0, %2 = lshr <4 x i64> %a1, @@ -71,16 +59,12 @@ define <4 x i32> @avx512_funnel_shift_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; X86-LABEL: avx512_funnel_shift_d_128: ; X86: # %bb.0: -; X86-NEXT: vpsllvd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-NEXT: vpsrlvd {{\.LCPI.*}}, %xmm1, %xmm1 -; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshldvd {{\.LCPI.*}}, %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: avx512_funnel_shift_d_128: ; X64: # %bb.0: -; X64-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpshldvd {{.*}}(%rip), %xmm1, %xmm0 ; X64-NEXT: retq %1 = shl <4 x i32> %a0, %2 = lshr <4 x i32> %a1, @@ -91,16 +75,12 @@ define <8 x i32> @avx512_funnel_shift_d_256(<8 x i32> %a0, <8 x i32> %a1) { ; X86-LABEL: avx512_funnel_shift_d_256: ; X86: # %bb.0: -; X86-NEXT: vpsllvd {{\.LCPI.*}}, %ymm0, %ymm0 -; X86-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm1, %ymm1 -; X86-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X86-NEXT: vpshldvd {{\.LCPI.*}}, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: avx512_funnel_shift_d_256: ; X64: # %bb.0: -; X64-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 -; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpshldvd {{.*}}(%rip), %ymm1, %ymm0 ; X64-NEXT: retq %1 = shl <8 x i32> %a0, %2 = lshr <8 x i32> %a1, @@ -111,9 +91,7 @@ define <4 x i32> @avx512_funnel_shift_d_128_splat(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: avx512_funnel_shift_d_128_splat: ; CHECK: # %bb.0: -; CHECK-NEXT: vpslld $15, %xmm0, %xmm0 -; CHECK-NEXT: vpsrld $17, %xmm1, %xmm1 -; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshldd $15, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shl <4 x i32> %a0, %2 = lshr <4 x i32> %a1, @@ -124,9 +102,7 @@ define <8 x i32> @avx512_funnel_shift_d_256_splat(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: avx512_funnel_shift_d_256_splat: ; CHECK: # %bb.0: -; CHECK-NEXT: vpslld $15, %ymm0, %ymm0 -; CHECK-NEXT: vpsrld $17, %ymm1, %ymm1 -; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshldd $15, %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shl <8 x i32> %a0, %2 = lshr <8 x i32> %a1, @@ -137,16 +113,12 @@ define <8 x i16> @avx512_funnel_shift_w_128(<8 x i16> %a0, <8 x i16> %a1) { ; X86-LABEL: avx512_funnel_shift_w_128: ; X86: # %bb.0: -; X86-NEXT: vpsllvw {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-NEXT: vpsrlvw {{\.LCPI.*}}, %xmm1, %xmm1 -; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshldvw {{\.LCPI.*}}, %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: avx512_funnel_shift_w_128: ; X64: # %bb.0: -; X64-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm1 -; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpshldvw {{.*}}(%rip), %xmm1, %xmm0 ; X64-NEXT: retq %1 = shl <8 x i16> %a0, %2 = lshr <8 x i16> %a1, @@ -157,16 +129,12 @@ define <16 x i16> @avx512_funnel_shift_w_256(<16 x i16> %a0, <16 x i16> %a1) { ; X86-LABEL: avx512_funnel_shift_w_256: ; X86: # %bb.0: -; X86-NEXT: vpsllvw {{\.LCPI.*}}, %ymm0, %ymm0 -; X86-NEXT: vpsrlvw {{\.LCPI.*}}, %ymm1, %ymm1 -; X86-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X86-NEXT: vpshldvw {{\.LCPI.*}}, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: avx512_funnel_shift_w_256: ; X64: # %bb.0: -; X64-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1 -; X64-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpshldvw {{.*}}(%rip), %ymm1, %ymm0 ; X64-NEXT: retq %1 = shl <16 x i16> %a0, %2 = lshr <16 x i16> %a1, @@ -177,9 +145,7 @@ define <8 x i16> @avx512_funnel_shift_w_128_splat(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: avx512_funnel_shift_w_128_splat: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 -; CHECK-NEXT: vpsrlw $9, %xmm1, %xmm1 -; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshldw $7, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shl <8 x i16> %a0, %2 = lshr <8 x i16> %a1, @@ -190,9 +156,7 @@ define <16 x i16> @avx512_funnel_shift_w_256_splat(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: avx512_funnel_shift_w_256_splat: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0 -; CHECK-NEXT: vpsrlw $9, %ymm1, %ymm1 -; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshldw $7, %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shl <16 x i16> %a0, %2 = lshr <16 x i16> %a1, diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -539,8 +539,8 @@ ; X86-NEXT: shrl $2, %ebx ; X86-NEXT: leal (%ebx,%ebp,4), %ebx ; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: andl $1431633920, %ebp # imm = 0x55550000 -; X86-NEXT: andl $-1431699456, %ebx # imm = 0xAAAA0000 +; X86-NEXT: andl $1431655765, %ebp # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %ebx # imm = 0xAAAAAAAA ; X86-NEXT: shrl %ebx ; X86-NEXT: leal (%ebx,%ebp,2), %ebx ; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill @@ -927,194 +927,201 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; X64-NEXT: bswapq %rbx -; X64-NEXT: movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F ; X64-NEXT: movq %rbx, %r10 -; X64-NEXT: andq %r13, %r10 +; X64-NEXT: andq %rdi, %r10 ; X64-NEXT: shlq $4, %r10 ; X64-NEXT: movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0 ; X64-NEXT: andq %rax, %rbx ; X64-NEXT: shrq $4, %rbx ; X64-NEXT: orq %r10, %rbx -; X64-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 +; X64-NEXT: movabsq $3689348814741910323, %r14 # imm = 0x3333333333333333 ; X64-NEXT: movq %rbx, %r10 -; X64-NEXT: andq %r11, %r10 -; X64-NEXT: movabsq $-3689348814741910324, %r14 # imm = 0xCCCCCCCCCCCCCCCC -; X64-NEXT: andq %r14, %rbx +; X64-NEXT: andq %r14, %r10 +; X64-NEXT: movabsq $-3689348814741910324, %r11 # imm = 0xCCCCCCCCCCCCCCCC +; X64-NEXT: andq %r11, %rbx +; X64-NEXT: movq %r11, %r12 ; X64-NEXT: shrq $2, %rbx ; X64-NEXT: leaq (%rbx,%r10,4), %r10 -; X64-NEXT: movabsq $6148820866244280320, %rbx # imm = 0x5555000000000000 -; X64-NEXT: andq %r10, %rbx -; X64-NEXT: movabsq $-6149102341220990976, %rdi # imm = 0xAAAA000000000000 -; X64-NEXT: andq %r10, %rdi -; X64-NEXT: shrq %rdi -; X64-NEXT: leaq (%rdi,%rbx,2), %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movabsq $6148914691236517205, %r11 # imm = 0x5555555555555555 +; X64-NEXT: movq %r10, %r13 +; X64-NEXT: andq %r11, %r13 +; X64-NEXT: movabsq $-6148914691236517206, %rbx # imm = 0xAAAAAAAAAAAAAAAA +; X64-NEXT: andq %rbx, %r10 +; X64-NEXT: movq %rbx, %r15 +; X64-NEXT: shrq %r10 +; X64-NEXT: leaq (%r10,%r13,2), %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: bswapq %rbp -; X64-NEXT: movq %rbp, %rdi -; X64-NEXT: andq %r13, %rdi -; X64-NEXT: shlq $4, %rdi +; X64-NEXT: movq %rbp, %r10 +; X64-NEXT: andq %rdi, %r10 +; X64-NEXT: shlq $4, %r10 ; X64-NEXT: andq %rax, %rbp ; X64-NEXT: shrq $4, %rbp -; X64-NEXT: orq %rdi, %rbp -; X64-NEXT: movq %rbp, %rdi -; X64-NEXT: andq %r11, %rdi -; X64-NEXT: andq %r14, %rbp +; X64-NEXT: orq %r10, %rbp +; X64-NEXT: movq %rbp, %r10 +; X64-NEXT: andq %r14, %r10 +; X64-NEXT: movq %r12, %rbx +; X64-NEXT: andq %r12, %rbp ; X64-NEXT: shrq $2, %rbp -; X64-NEXT: leaq (%rbp,%rdi,4), %rbp -; X64-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555 +; X64-NEXT: leaq (%rbp,%r10,4), %rbp ; X64-NEXT: movq %rbp, %r10 -; X64-NEXT: andq %rbx, %r10 -; X64-NEXT: movabsq $-6148914691236517206, %rdi # imm = 0xAAAAAAAAAAAAAAAA -; X64-NEXT: andq %rdi, %rbp +; X64-NEXT: andq %r11, %r10 +; X64-NEXT: andq %r15, %rbp ; X64-NEXT: shrq %rbp ; X64-NEXT: leaq (%rbp,%r10,2), %rbp ; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; X64-NEXT: bswapq %rbp ; X64-NEXT: movq %rbp, %r10 -; X64-NEXT: andq %r13, %r10 +; X64-NEXT: andq %rdi, %r10 ; X64-NEXT: shlq $4, %r10 ; X64-NEXT: andq %rax, %rbp -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: shrq $4, %rbp ; X64-NEXT: orq %r10, %rbp ; X64-NEXT: movq %rbp, %r10 -; X64-NEXT: andq %r11, %r10 -; X64-NEXT: andq %r14, %rbp +; X64-NEXT: andq %r14, %r10 +; X64-NEXT: andq %rbx, %rbp +; X64-NEXT: movq %rbx, %r13 ; X64-NEXT: shrq $2, %rbp ; X64-NEXT: leaq (%rbp,%r10,4), %rbp ; X64-NEXT: movq %rbp, %r10 -; X64-NEXT: andq %rbx, %r10 -; X64-NEXT: andq %rdi, %rbp +; X64-NEXT: andq %r11, %r10 +; X64-NEXT: andq %r15, %rbp ; X64-NEXT: shrq %rbp ; X64-NEXT: leaq (%rbp,%r10,2), %rbp ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; X64-NEXT: bswapq %r10 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: andq %r13, %rax +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: shlq $4, %rax -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: andq %r15, %r10 +; X64-NEXT: movq %r12, %rbx +; X64-NEXT: andq %r12, %r10 ; X64-NEXT: shrq $4, %r10 ; X64-NEXT: orq %rax, %r10 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: andq %r11, %rax -; X64-NEXT: andq %r14, %r10 +; X64-NEXT: andq %r14, %rax +; X64-NEXT: movq %r13, %r15 +; X64-NEXT: andq %r13, %r10 ; X64-NEXT: shrq $2, %r10 ; X64-NEXT: leaq (%r10,%rax,4), %rax ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: andq %rbx, %r10 -; X64-NEXT: movabsq $-6148914691236517206, %r15 # imm = 0xAAAAAAAAAAAAAAAA -; X64-NEXT: andq %r15, %rax +; X64-NEXT: andq %r11, %r10 +; X64-NEXT: movabsq $-6148914691236517206, %r12 # imm = 0xAAAAAAAAAAAAAAAA +; X64-NEXT: andq %r12, %rax ; X64-NEXT: shrq %rax ; X64-NEXT: leaq (%rax,%r10,2), %r10 ; X64-NEXT: bswapq %r9 ; X64-NEXT: movq %r9, %rax -; X64-NEXT: andq %r13, %rax +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: shlq $4, %rax -; X64-NEXT: andq %rdi, %r9 +; X64-NEXT: andq %rbx, %r9 +; X64-NEXT: movq %rbx, %r13 ; X64-NEXT: shrq $4, %r9 ; X64-NEXT: orq %rax, %r9 ; X64-NEXT: movq %r9, %rax -; X64-NEXT: andq %r11, %rax -; X64-NEXT: andq %r14, %r9 +; X64-NEXT: andq %r14, %rax +; X64-NEXT: andq %r15, %r9 ; X64-NEXT: shrq $2, %r9 ; X64-NEXT: leaq (%r9,%rax,4), %rax ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: andq %rbx, %r9 -; X64-NEXT: andq %r15, %rax +; X64-NEXT: andq %r11, %r9 +; X64-NEXT: andq %r12, %rax ; X64-NEXT: shrq %rax ; X64-NEXT: leaq (%rax,%r9,2), %r9 ; X64-NEXT: bswapq %r8 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: andq %r13, %rax +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: shlq $4, %rax -; X64-NEXT: andq %rdi, %r8 +; X64-NEXT: andq %rbx, %r8 ; X64-NEXT: shrq $4, %r8 ; X64-NEXT: orq %rax, %r8 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: andq %r11, %rax -; X64-NEXT: andq %r14, %r8 +; X64-NEXT: andq %r14, %rax +; X64-NEXT: andq %r15, %r8 ; X64-NEXT: shrq $2, %r8 ; X64-NEXT: leaq (%r8,%rax,4), %rax ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: andq %rbx, %r8 -; X64-NEXT: andq %r15, %rax +; X64-NEXT: andq %r11, %r8 +; X64-NEXT: andq %r12, %rax ; X64-NEXT: shrq %rax ; X64-NEXT: leaq (%rax,%r8,2), %r8 ; X64-NEXT: bswapq %rcx ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: andq %r13, %rax +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: shlq $4, %rax -; X64-NEXT: andq %rdi, %rcx +; X64-NEXT: andq %rbx, %rcx ; X64-NEXT: shrq $4, %rcx ; X64-NEXT: orq %rax, %rcx ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: andq %r11, %rax -; X64-NEXT: andq %r14, %rcx +; X64-NEXT: andq %r14, %rax +; X64-NEXT: andq %r15, %rcx ; X64-NEXT: shrq $2, %rcx ; X64-NEXT: leaq (%rcx,%rax,4), %rax ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: andq %rbx, %rcx -; X64-NEXT: andq %r15, %rax +; X64-NEXT: andq %r11, %rcx +; X64-NEXT: andq %r12, %rax +; X64-NEXT: movq %r12, %rbx ; X64-NEXT: shrq %rax -; X64-NEXT: leaq (%rax,%rcx,2), %rcx +; X64-NEXT: leaq (%rax,%rcx,2), %r12 ; X64-NEXT: bswapq %rdx ; X64-NEXT: movq %rdx, %rax -; X64-NEXT: andq %r13, %rax +; X64-NEXT: andq %rdi, %rax ; X64-NEXT: shlq $4, %rax -; X64-NEXT: andq %rdi, %rdx +; X64-NEXT: andq %r13, %rdx ; X64-NEXT: shrq $4, %rdx ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: movq %rdx, %rax -; X64-NEXT: andq %r11, %rax -; X64-NEXT: andq %r14, %rdx +; X64-NEXT: andq %r14, %rax +; X64-NEXT: andq %r15, %rdx ; X64-NEXT: shrq $2, %rdx ; X64-NEXT: leaq (%rdx,%rax,4), %rax ; X64-NEXT: movq %rax, %rdx -; X64-NEXT: andq %rbx, %rdx -; X64-NEXT: andq %r15, %rax +; X64-NEXT: andq %r11, %rdx +; X64-NEXT: andq %rbx, %rax ; X64-NEXT: shrq %rax -; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: leaq (%rax,%rdx,2), %rcx ; X64-NEXT: bswapq %rsi -; X64-NEXT: andq %rsi, %r13 -; X64-NEXT: andq %rdi, %rsi -; X64-NEXT: shlq $4, %r13 +; X64-NEXT: andq %rsi, %rdi +; X64-NEXT: andq %r13, %rsi +; X64-NEXT: shlq $4, %rdi ; X64-NEXT: shrq $4, %rsi -; X64-NEXT: orq %r13, %rsi -; X64-NEXT: andq %rsi, %r11 -; X64-NEXT: andq %r14, %rsi +; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: andq %rsi, %r14 +; X64-NEXT: andq %r15, %rsi ; X64-NEXT: shrq $2, %rsi -; X64-NEXT: leaq (%rsi,%r11,4), %rdx -; X64-NEXT: andq %rdx, %rbx -; X64-NEXT: andq %r15, %rdx +; X64-NEXT: leaq (%rsi,%r14,4), %rdx +; X64-NEXT: andq %rdx, %r11 +; X64-NEXT: andq %rbx, %rdx ; X64-NEXT: shrq %rdx -; X64-NEXT: leaq (%rdx,%rbx,2), %rdx +; X64-NEXT: leaq (%rdx,%r11,2), %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: shrdq $48, %rdi, %rsi -; X64-NEXT: shrdq $48, %rbp, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: shrdq $48, %rax, %rsi +; X64-NEXT: shrdq $48, %rbp, %rax +; X64-NEXT: movq %rax, %rdi ; X64-NEXT: shrdq $48, %r10, %rbp ; X64-NEXT: shrdq $48, %r9, %r10 ; X64-NEXT: shrdq $48, %r8, %r9 -; X64-NEXT: shrdq $48, %rcx, %r8 -; X64-NEXT: shrdq $48, %rax, %rcx -; X64-NEXT: shrdq $48, %rdx, %rax -; X64-NEXT: movq %rax, 56(%r12) -; X64-NEXT: movq %rcx, 48(%r12) -; X64-NEXT: movq %r8, 40(%r12) -; X64-NEXT: movq %r9, 32(%r12) -; X64-NEXT: movq %r10, 24(%r12) -; X64-NEXT: movq %rbp, 16(%r12) -; X64-NEXT: movq %rdi, 8(%r12) -; X64-NEXT: movq %rsi, (%r12) +; X64-NEXT: shrdq $48, %r12, %r8 +; X64-NEXT: shrdq $48, %rcx, %r12 +; X64-NEXT: shrdq $48, %rdx, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: movq %rcx, 56(%rax) +; X64-NEXT: movq %r12, 48(%rax) +; X64-NEXT: movq %r8, 40(%rax) +; X64-NEXT: movq %r9, 32(%rax) +; X64-NEXT: movq %r10, 24(%rax) +; X64-NEXT: movq %rbp, 16(%rax) +; X64-NEXT: movq %rdi, 8(%rax) +; X64-NEXT: movq %rsi, (%rax) ; X64-NEXT: shrq $48, %rdx -; X64-NEXT: movw %dx, 64(%r12) -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movw %dx, 64(%rax) ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll @@ -1615,11 +1615,10 @@ define i64 @test_i64_140737488289792_mask_lshr_16(i64 %a0) { ; X32-LABEL: test_i64_140737488289792_mask_lshr_16: ; X32: # %bb.0: -; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl $32767, %eax # imm = 0x7FFF ; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shll $16, %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: shldl $16, %ecx, %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; @@ -1790,11 +1789,10 @@ define i64 @test_i64_140737488289792_mask_ashr_16(i64 %a0) { ; X32-LABEL: test_i64_140737488289792_mask_ashr_16: ; X32: # %bb.0: -; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl $32767, %eax # imm = 0x7FFF ; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shll $16, %eax -; X32-NEXT: orl %ecx, %eax +; X32-NEXT: shldl $16, %ecx, %eax ; X32-NEXT: xorl %edx, %edx ; X32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -585,34 +585,12 @@ } define i64 @combine_fshl_load_i64(i64* %p) nounwind { -; X86-FAST-LABEL: combine_fshl_load_i64: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: movl 12(%ecx), %eax -; X86-FAST-NEXT: movl 16(%ecx), %esi -; X86-FAST-NEXT: movl 20(%ecx), %edx -; X86-FAST-NEXT: shldl $24, %esi, %edx -; X86-FAST-NEXT: shrdl $8, %esi, %eax -; X86-FAST-NEXT: popl %esi -; X86-FAST-NEXT: retl -; -; X86-SLOW-LABEL: combine_fshl_load_i64: -; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl 20(%eax), %edx -; X86-SLOW-NEXT: movl 12(%eax), %ecx -; X86-SLOW-NEXT: movl 16(%eax), %esi -; X86-SLOW-NEXT: shrl $8, %ecx -; X86-SLOW-NEXT: movl %esi, %eax -; X86-SLOW-NEXT: shll $24, %eax -; X86-SLOW-NEXT: orl %ecx, %eax -; X86-SLOW-NEXT: shrl $8, %esi -; X86-SLOW-NEXT: shll $24, %edx -; X86-SLOW-NEXT: orl %esi, %edx -; X86-SLOW-NEXT: popl %esi -; X86-SLOW-NEXT: retl +; X86-LABEL: combine_fshl_load_i64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 13(%ecx), %eax +; X86-NEXT: movl 17(%ecx), %edx +; X86-NEXT: retl ; ; X64-LABEL: combine_fshl_load_i64: ; X64: # %bb.0: diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -491,15 +491,15 @@ ; X86-SLOW: # %bb.0: ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: movl %ecx, %esi -; X86-SLOW-NEXT: shll $25, %esi -; X86-SLOW-NEXT: shrl $7, %eax -; X86-SLOW-NEXT: orl %esi, %eax +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: shrl $7, %ecx +; X86-SLOW-NEXT: movl %esi, %eax +; X86-SLOW-NEXT: shll $25, %eax +; X86-SLOW-NEXT: orl %ecx, %eax +; X86-SLOW-NEXT: shrl $7, %esi ; X86-SLOW-NEXT: shll $25, %edx -; X86-SLOW-NEXT: orl %ecx, %edx +; X86-SLOW-NEXT: orl %esi, %edx ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: retl ; @@ -580,35 +580,12 @@ } define i64 @combine_fshr_load_i64(i64* %p) nounwind { -; X86-FAST-LABEL: combine_fshr_load_i64: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movzbl 11(%eax), %ecx -; X86-FAST-NEXT: movl 12(%eax), %esi -; X86-FAST-NEXT: movl 16(%eax), %edx -; X86-FAST-NEXT: shldl $8, %esi, %edx -; X86-FAST-NEXT: movl %esi, %eax -; X86-FAST-NEXT: shll $8, %eax -; X86-FAST-NEXT: orl %ecx, %eax -; X86-FAST-NEXT: popl %esi -; X86-FAST-NEXT: retl -; -; X86-SLOW-LABEL: combine_fshr_load_i64: -; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movzbl 11(%eax), %ecx -; X86-SLOW-NEXT: movl 12(%eax), %esi -; X86-SLOW-NEXT: movl 16(%eax), %edx -; X86-SLOW-NEXT: movl %esi, %eax -; X86-SLOW-NEXT: shll $8, %eax -; X86-SLOW-NEXT: orl %ecx, %eax -; X86-SLOW-NEXT: shrl $24, %esi -; X86-SLOW-NEXT: shll $8, %edx -; X86-SLOW-NEXT: orl %esi, %edx -; X86-SLOW-NEXT: popl %esi -; X86-SLOW-NEXT: retl +; X86-LABEL: combine_fshr_load_i64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 11(%ecx), %eax +; X86-NEXT: movl 15(%ecx), %edx +; X86-NEXT: retl ; ; X64-LABEL: combine_fshr_load_i64: ; X64: # %bb.0: diff --git a/llvm/test/CodeGen/X86/known-bits.ll b/llvm/test/CodeGen/X86/known-bits.ll --- a/llvm/test/CodeGen/X86/known-bits.ll +++ b/llvm/test/CodeGen/X86/known-bits.ll @@ -120,7 +120,7 @@ ; X64-NEXT: andq $-1024, %rdi # imm = 0xFC00 ; X64-NEXT: andq $-1024, %rsi # imm = 0xFC00 ; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: adcl $0, %edx +; X64-NEXT: adcq $0, %rdx ; X64-NEXT: shldq $54, %rsi, %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll --- a/llvm/test/CodeGen/X86/pr43820.ll +++ b/llvm/test/CodeGen/X86/pr43820.ll @@ -10,6 +10,7 @@ ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rsi, %r12 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx @@ -18,303 +19,304 @@ ; CHECK-NEXT: movq %rbx, %rbp ; CHECK-NEXT: andq %rdi, %rbp ; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: movabsq $-1085102592571150096, %r11 # imm = 0xF0F0F0F0F0F0F0F0 -; CHECK-NEXT: andq %r11, %rbx -; CHECK-NEXT: movq %r11, %rax +; CHECK-NEXT: movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0 +; CHECK-NEXT: andq %rax, %rbx +; CHECK-NEXT: movq %rax, %rsi ; CHECK-NEXT: shrq $4, %rbx ; CHECK-NEXT: orq %rbp, %rbx ; CHECK-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 ; CHECK-NEXT: movq %rbx, %r14 ; CHECK-NEXT: andq %r11, %r14 -; CHECK-NEXT: movabsq $-3689348814741910324, %rbp # imm = 0xCCCCCCCCCCCCCCCC -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: movq %rbp, %r15 +; CHECK-NEXT: movabsq $-3689348814741910324, %rax # imm = 0xCCCCCCCCCCCCCCCC +; CHECK-NEXT: andq %rax, %rbx ; CHECK-NEXT: shrq $2, %rbx ; CHECK-NEXT: leaq (%rbx,%r14,4), %r14 -; CHECK-NEXT: movabsq $6148914691230924800, %rbx # imm = 0x5555555555000000 -; CHECK-NEXT: andq %r14, %rbx -; CHECK-NEXT: movabsq $-6148914691247702016, %rbp # imm = 0xAAAAAAAAAA000000 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%rbx,2), %rbx -; CHECK-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555 +; CHECK-NEXT: movq %r14, %r15 +; CHECK-NEXT: andq %rbx, %r15 +; CHECK-NEXT: movabsq $-6148914691236517206, %rbp # imm = 0xAAAAAAAAAAAAAAAA +; CHECK-NEXT: andq %rbp, %r14 +; CHECK-NEXT: movq %rbp, %r13 +; CHECK-NEXT: shrq %r14 +; CHECK-NEXT: leaq (%r14,%r15,2), %rbp +; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r10 -; CHECK-NEXT: movq %r10, %rbx -; CHECK-NEXT: andq %rdi, %rbx -; CHECK-NEXT: shlq $4, %rbx -; CHECK-NEXT: andq %rax, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: andq %rsi, %r10 ; CHECK-NEXT: shrq $4, %r10 -; CHECK-NEXT: orq %rbx, %r10 -; CHECK-NEXT: movq %r10, %rbx -; CHECK-NEXT: andq %r11, %rbx -; CHECK-NEXT: andq %r15, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: andq %rax, %r10 ; CHECK-NEXT: shrq $2, %r10 -; CHECK-NEXT: leaq (%r10,%rbx,4), %rbp -; CHECK-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555 -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: movabsq $-6148914691236517206, %r13 # imm = 0xAAAAAAAAAAAAAAAA -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %rax, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %rax, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: andq %r13, %r10 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rbp ; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %rax, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %rax, %rbp -; CHECK-NEXT: movq %rax, %r14 -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: andq %rax, %r10 +; CHECK-NEXT: movq %rax, %rbp +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: andq %r13, %r10 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: movq %rbp, %rax +; CHECK-NEXT: andq %rbp, %r10 +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: andq %r13, %r10 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rbp ; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: bswapq %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %rdi, %r10 -; CHECK-NEXT: shlq $4, %r10 -; CHECK-NEXT: andq %r14, %rbp -; CHECK-NEXT: shrq $4, %rbp -; CHECK-NEXT: orq %r10, %rbp -; CHECK-NEXT: movq %rbp, %r10 -; CHECK-NEXT: andq %r11, %r10 -; CHECK-NEXT: andq %r15, %rbp -; CHECK-NEXT: shrq $2, %rbp -; CHECK-NEXT: leaq (%rbp,%r10,4), %rbp -; CHECK-NEXT: movq %rbp, %r10 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: movq %rsi, %rbp +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: andq %rax, %r10 +; CHECK-NEXT: movq %rax, %rsi +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: andq %r13, %r10 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: andq %rbp, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: andq %r13, %r10 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: andq %rbp, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: andq %r13, %r10 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: andq %rbp, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: andq %r13, %r10 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: andq %rbp, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: andq %r13, %r10 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rdi, %r14 +; CHECK-NEXT: shlq $4, %r14 +; CHECK-NEXT: andq %rbp, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: orq %r14, %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %r11, %r14 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: leaq (%r10,%r14,4), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: andq %rbx, %r14 +; CHECK-NEXT: andq %r13, %r10 +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: leaq (%r10,%r14,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: bswapq %r10 +; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: andq %rbp, %r10 +; CHECK-NEXT: shrq $4, %r10 +; CHECK-NEXT: orq %rax, %r10 +; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: shrq $2, %r10 +; CHECK-NEXT: leaq (%r10,%rax,4), %rax +; CHECK-NEXT: movq %rax, %r10 ; CHECK-NEXT: andq %rbx, %r10 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r10,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: andq %r13, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: leaq (%rax,%r10,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r9 -; CHECK-NEXT: movq %r9, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: andq %r14, %r9 +; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: andq %rbp, %r9 ; CHECK-NEXT: shrq $4, %r9 -; CHECK-NEXT: orq %rbp, %r9 -; CHECK-NEXT: movq %r9, %rbp -; CHECK-NEXT: andq %r11, %rbp -; CHECK-NEXT: andq %r15, %r9 +; CHECK-NEXT: orq %rax, %r9 +; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: andq %rsi, %r9 ; CHECK-NEXT: shrq $2, %r9 -; CHECK-NEXT: leaq (%r9,%rbp,4), %rbp -; CHECK-NEXT: movq %rbp, %r9 +; CHECK-NEXT: leaq (%r9,%rax,4), %rax +; CHECK-NEXT: movq %rax, %r9 ; CHECK-NEXT: andq %rbx, %r9 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r9,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: andq %r13, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: leaq (%rax,%r9,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r8 -; CHECK-NEXT: movq %r8, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: andq %r14, %r8 +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: andq %rbp, %r8 ; CHECK-NEXT: shrq $4, %r8 -; CHECK-NEXT: orq %rbp, %r8 -; CHECK-NEXT: movq %r8, %rbp -; CHECK-NEXT: andq %r11, %rbp -; CHECK-NEXT: andq %r15, %r8 -; CHECK-NEXT: movq %r15, %r9 +; CHECK-NEXT: orq %rax, %r8 +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: andq %rsi, %r8 ; CHECK-NEXT: shrq $2, %r8 -; CHECK-NEXT: leaq (%r8,%rbp,4), %rbp -; CHECK-NEXT: movq %rbp, %r8 +; CHECK-NEXT: leaq (%r8,%rax,4), %rax +; CHECK-NEXT: movq %rax, %r8 ; CHECK-NEXT: andq %rbx, %r8 -; CHECK-NEXT: andq %r13, %rbp -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: leaq (%rbp,%r8,2), %rbp -; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: andq %r13, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: leaq (%rax,%r8,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: movq %rcx, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: andq %r14, %rcx +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: andq %rbp, %rcx ; CHECK-NEXT: shrq $4, %rcx -; CHECK-NEXT: orq %rbp, %rcx -; CHECK-NEXT: movq %rcx, %rbp -; CHECK-NEXT: andq %r11, %rbp -; CHECK-NEXT: andq %r15, %rcx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: andq %rsi, %rcx ; CHECK-NEXT: shrq $2, %rcx -; CHECK-NEXT: leaq (%rcx,%rbp,4), %rcx -; CHECK-NEXT: movq %rcx, %rbp -; CHECK-NEXT: andq %rbx, %rbp -; CHECK-NEXT: andq %r13, %rcx -; CHECK-NEXT: shrq %rcx -; CHECK-NEXT: leaq (%rcx,%rbp,2), %r15 +; CHECK-NEXT: leaq (%rcx,%rax,4), %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: andq %rbx, %rcx +; CHECK-NEXT: andq %r13, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: leaq (%rax,%rcx,2), %r15 ; CHECK-NEXT: bswapq %rdx -; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: andq %rdi, %rbp -; CHECK-NEXT: shlq $4, %rbp -; CHECK-NEXT: andq %r14, %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: andq %rbp, %rdx ; CHECK-NEXT: shrq $4, %rdx -; CHECK-NEXT: orq %rbp, %rdx -; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: andq %r11, %rbp -; CHECK-NEXT: andq %r9, %rdx +; CHECK-NEXT: orq %rax, %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: andq %rsi, %rdx ; CHECK-NEXT: shrq $2, %rdx -; CHECK-NEXT: leaq (%rdx,%rbp,4), %rdx -; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: andq %rbx, %rbp -; CHECK-NEXT: andq %r13, %rdx -; CHECK-NEXT: shrq %rdx -; CHECK-NEXT: leaq (%rdx,%rbp,2), %rdx -; CHECK-NEXT: bswapq %rsi -; CHECK-NEXT: andq %rsi, %rdi -; CHECK-NEXT: andq %r14, %rsi +; CHECK-NEXT: leaq (%rdx,%rax,4), %rax +; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: andq %rbx, %rdx +; CHECK-NEXT: andq %r13, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: leaq (%rax,%rdx,2), %rdx +; CHECK-NEXT: bswapq %r12 +; CHECK-NEXT: andq %r12, %rdi +; CHECK-NEXT: andq %rbp, %r12 ; CHECK-NEXT: shlq $4, %rdi -; CHECK-NEXT: shrq $4, %rsi -; CHECK-NEXT: orq %rdi, %rsi -; CHECK-NEXT: andq %rsi, %r11 -; CHECK-NEXT: andq %r9, %rsi -; CHECK-NEXT: shrq $2, %rsi -; CHECK-NEXT: leaq (%rsi,%r11,4), %rsi -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %r13, %rsi -; CHECK-NEXT: shrq %rsi -; CHECK-NEXT: leaq (%rsi,%rbx,2), %r13 +; CHECK-NEXT: shrq $4, %r12 +; CHECK-NEXT: orq %rdi, %r12 +; CHECK-NEXT: andq %r12, %r11 +; CHECK-NEXT: andq %rsi, %r12 +; CHECK-NEXT: shrq $2, %r12 +; CHECK-NEXT: leaq (%r12,%r11,4), %rax +; CHECK-NEXT: andq %rax, %rbx +; CHECK-NEXT: andq %r13, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: leaq (%rax,%rbx,2), %r12 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-NEXT: shrdq $24, %rax, %r11 @@ -324,10 +326,10 @@ ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; CHECK-NEXT: shrdq $24, %rbp, %rcx ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r12, %rbp +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; CHECK-NEXT: shrdq $24, %r13, %rbp ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r14, %r12 +; CHECK-NEXT: shrdq $24, %r14, %r13 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; CHECK-NEXT: shrdq $24, %rbx, %r14 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload @@ -345,7 +347,7 @@ ; CHECK-NEXT: shrdq $24, %r15, %rax ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrdq $24, %rdx, %r15 -; CHECK-NEXT: shrdq $24, %r13, %rdx +; CHECK-NEXT: shrdq $24, %r12, %rdx ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-NEXT: movq %rdx, 112(%rax) ; CHECK-NEXT: movq %r15, 104(%rax) @@ -357,16 +359,16 @@ ; CHECK-NEXT: movq %r10, 56(%rax) ; CHECK-NEXT: movq %rbx, 48(%rax) ; CHECK-NEXT: movq %r14, 40(%rax) -; CHECK-NEXT: movq %r12, 32(%rax) +; CHECK-NEXT: movq %r13, 32(%rax) ; CHECK-NEXT: movq %rbp, 24(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 16(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 8(%rax) ; CHECK-NEXT: movq %r11, (%rax) -; CHECK-NEXT: movq %r13, %rcx -; CHECK-NEXT: shrq $56, %r13 -; CHECK-NEXT: movb %r13b, 124(%rax) +; CHECK-NEXT: movq %r12, %rcx +; CHECK-NEXT: shrq $56, %r12 +; CHECK-NEXT: movb %r12b, 124(%rax) ; CHECK-NEXT: shrq $24, %rcx ; CHECK-NEXT: movl %ecx, 120(%rax) ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -261,16 +261,12 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: subq $24, %rsp +; X64-NEXT: pushq %rax ; X64-NEXT: movq %rsi, %r14 ; X64-NEXT: movq %rdi, %r15 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq $33, %rax ; X64-NEXT: movq %rdi, %rbx ; X64-NEXT: sarq $63, %rbx -; X64-NEXT: shlq $31, %rbx -; X64-NEXT: orq %rax, %rbx -; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; X64-NEXT: shldq $31, %rdi, %rbx ; X64-NEXT: shlq $31, %r15 ; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: sarq $63, %r12 @@ -281,10 +277,12 @@ ; X64-NEXT: callq __divti3 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: decq %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill +; X64-NEXT: testq %rbx, %rbx +; X64-NEXT: sets %al ; X64-NEXT: testq %r12, %r12 ; X64-NEXT: sets %bpl -; X64-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Folded Reload +; X64-NEXT: xorb %al, %bpl ; X64-NEXT: movq %r15, %rdi ; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: movq %r14, %rdx @@ -293,9 +291,9 @@ ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: testb %bpl, %al -; X64-NEXT: cmovneq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmovneq (%rsp), %r13 # 8-byte Folded Reload ; X64-NEXT: movq %r13, %rax -; X64-NEXT: addq $24, %rsp +; X64-NEXT: addq $8, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -540,169 +538,168 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: subl $60, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: shll $31, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: shrl %eax -; X86-NEXT: andl $-2147483648, %ebx # imm = 0x80000000 -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: shrl %eax -; X86-NEXT: andl $-2147483648, %ebp # imm = 0x80000000 -; X86-NEXT: orl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: andl $-2147483648, %ebp # imm = 0x80000000 -; X86-NEXT: orl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: sarl $31, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sets (%esp) # 1-byte Folded Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: shrl %eax -; X86-NEXT: andl $-2147483648, %edi # imm = 0x80000000 -; X86-NEXT: orl %eax, %edi -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: pushl %edx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shll $31, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: calll __moddi3 -; X86-NEXT: addl $16, %esp +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: shldl $31, %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ebp -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %edx ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $31, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: shll $31, %ebp +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shrl $31, %ecx +; X86-NEXT: shldl $31, %ebx, %ecx ; X86-NEXT: pushl %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %ebp ; X86-NEXT: calll __moddi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $31, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: pushl %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: shll $31, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: shrl $31, %edi +; X86-NEXT: shldl $31, %edx, %edi ; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: pushl %ebx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %eax ; X86-NEXT: movl %eax, %esi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ebx ; X86-NEXT: calll __moddi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $31, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: sarl $31, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: sarl $31, %ebp -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: pushl %esi +; X86-NEXT: shll $31, %ebp +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $31, %esi +; X86-NEXT: shldl $31, %ecx, %esi +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %eax +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ebp ; X86-NEXT: calll __moddi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: testl %ebp, %ebp -; X86-NEXT: sets %bl -; X86-NEXT: xorb (%esp), %bl # 1-byte Folded Reload -; X86-NEXT: pushl %ebp +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl %esi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl %ebp ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: testl %esi, %esi +; X86-NEXT: sets %cl +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: sets %dl +; X86-NEXT: xorb %cl, %dl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: setne %cl -; X86-NEXT: testb %bl, %cl +; X86-NEXT: testb %dl, %cl ; X86-NEXT: leal -1(%eax), %ecx ; X86-NEXT: cmovel %eax, %ecx -; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %al -; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: setne %dl -; X86-NEXT: testb %al, %dl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: leal -1(%eax), %edi -; X86-NEXT: cmovel %eax, %edi ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %dl -; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload +; X86-NEXT: sets %cl +; X86-NEXT: xorb %al, %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: setne %dh -; X86-NEXT: testb %dl, %dh +; X86-NEXT: setne %al +; X86-NEXT: testb %cl, %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: leal -1(%eax), %edx -; X86-NEXT: cmovel %eax, %edx +; X86-NEXT: leal -1(%eax), %ecx +; X86-NEXT: cmovel %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %bl -; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload +; X86-NEXT: sets %al +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: sets %cl +; X86-NEXT: xorb %al, %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: setne %bh -; X86-NEXT: testb %bl, %bh +; X86-NEXT: setne %al +; X86-NEXT: testb %cl, %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: leal -1(%eax), %esi -; X86-NEXT: cmovel %eax, %esi +; X86-NEXT: leal -1(%eax), %ebp +; X86-NEXT: cmovel %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: testl %edx, %edx +; X86-NEXT: sets %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sets %bl +; X86-NEXT: xorb %al, %bl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: leal -1(%edi), %esi +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %edx +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: calll __moddi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: orl %eax, %edx +; X86-NEXT: setne %al +; X86-NEXT: testb %bl, %al +; X86-NEXT: cmovel %edi, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebp, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $64, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -308,18 +308,15 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: subq $40, %rsp +; X64-NEXT: subq $24, %rsp +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: movq %rsi, (%rsp) # 8-byte Spill ; X64-NEXT: movq %rdi, %r15 ; X64-NEXT: leaq (%rdi,%rdi), %rax -; X64-NEXT: shrq $33, %rax ; X64-NEXT: movq %rdi, %r12 ; X64-NEXT: sarq $63, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: orq %rax, %r12 -; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; X64-NEXT: shldq $31, %rax, %r12 ; X64-NEXT: shlq $32, %r15 -; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rsi, %r13 ; X64-NEXT: sarq $63, %r13 ; X64-NEXT: movq %r15, %rdi @@ -332,12 +329,14 @@ ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %rbx ; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: testq %r12, %r12 +; X64-NEXT: sets %al ; X64-NEXT: testq %r13, %r13 ; X64-NEXT: sets %r14b -; X64-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %r14b # 1-byte Folded Reload +; X64-NEXT: xorb %al, %r14b ; X64-NEXT: movq %r15, %rdi ; X64-NEXT: movq %r12, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq (%rsp), %rdx # 8-byte Reload ; X64-NEXT: movq %r13, %rcx ; X64-NEXT: callq __modti3 ; X64-NEXT: orq %rax, %rdx @@ -364,7 +363,7 @@ ; X64-NEXT: cmoveq %rcx, %rbx ; X64-NEXT: shrdq $1, %rax, %rbx ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: addq $40, %rsp +; X64-NEXT: addq $24, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -597,82 +596,79 @@ ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: paddq %xmm0, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rbp -; X64-NEXT: movq %rbp, %r12 -; X64-NEXT: shrq $33, %r12 -; X64-NEXT: movq %rbp, %r14 -; X64-NEXT: sarq $63, %r14 -; X64-NEXT: shlq $31, %r14 -; X64-NEXT: orq %r14, %r12 +; X64-NEXT: movq %xmm0, %rbx +; X64-NEXT: movq %rbx, %r15 +; X64-NEXT: sarq $63, %r15 +; X64-NEXT: movq %r15, %r12 +; X64-NEXT: shldq $31, %rbx, %r12 ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: sarq $63, %rbx -; X64-NEXT: shlq $31, %rbp -; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shlq $31, %rbx +; X64-NEXT: movq %rbx, %rdi ; X64-NEXT: movq %r12, %rsi -; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: movq %r14, %rcx ; X64-NEXT: callq __divti3 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r13 -; X64-NEXT: sbbq $0, %r15 -; X64-NEXT: shrq $63, %r14 -; X64-NEXT: xorl %ebx, %r14d -; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %rbx, %rdi ; X64-NEXT: movq %r12, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: movq %r14, %rcx ; X64-NEXT: callq __modti3 ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al -; X64-NEXT: testb %r14b, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: shlq $31, %r15 +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: xorl %r14d, %r15d +; X64-NEXT: testb %r15b, %al +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rdx, %r13 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r13, %rax ; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testq %r15, %r15 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovnsq %rdx, %r13 ; X64-NEXT: cmoveq %rax, %r13 -; X64-NEXT: cmovnsq %rcx, %r15 +; X64-NEXT: cmovnsq %rcx, %rbp ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r13, %rax -; X64-NEXT: testq %r15, %r15 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovsq %rcx, %r13 -; X64-NEXT: cmpq $-1, %r15 +; X64-NEXT: cmpq $-1, %rbp ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,0,1] -; X64-NEXT: movq %xmm0, %r13 -; X64-NEXT: movq %r13, %rbx -; X64-NEXT: shrq $33, %rbx -; X64-NEXT: movq %r13, %r14 -; X64-NEXT: sarq $63, %r14 -; X64-NEXT: shlq $31, %r14 -; X64-NEXT: orq %r14, %rbx +; X64-NEXT: movq %xmm0, %rbp +; X64-NEXT: movq %rbp, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %rbx, %r13 +; X64-NEXT: shldq $31, %rbp, %r13 ; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,0,1] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shlq $31, %r13 -; X64-NEXT: movq %r13, %rdi -; X64-NEXT: movq %rbx, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shlq $31, %rbp +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq %r13, %rsi +; X64-NEXT: movq %r14, %rcx ; X64-NEXT: callq __divti3 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -680,16 +676,17 @@ ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r12 ; X64-NEXT: sbbq $0, %r15 -; X64-NEXT: shrq $63, %r14 -; X64-NEXT: xorl %ebp, %r14d -; X64-NEXT: movq %r13, %rdi -; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %r14, %rcx ; X64-NEXT: callq __modti3 ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al -; X64-NEXT: testb %r14b, %al +; X64-NEXT: shlq $31, %rbx +; X64-NEXT: shrq $63, %rbx +; X64-NEXT: xorl %r14d, %ebx +; X64-NEXT: testb %bl, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF @@ -721,13 +718,11 @@ ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: paddq %xmm1, %xmm1 ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm1, %r12 -; X64-NEXT: movq %r12, %rbx -; X64-NEXT: shrq $33, %rbx -; X64-NEXT: movq %r12, %r14 +; X64-NEXT: movq %xmm1, %rbp +; X64-NEXT: movq %rbp, %r14 ; X64-NEXT: sarq $63, %r14 -; X64-NEXT: shlq $31, %r14 -; X64-NEXT: orq %r14, %rbx +; X64-NEXT: movq %r14, %r13 +; X64-NEXT: shldq $31, %rbp, %r13 ; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; X64-NEXT: # xmm1 = mem[2,3,0,1] ; X64-NEXT: pxor %xmm0, %xmm0 @@ -736,69 +731,68 @@ ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbx, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: shlq $31, %rbp +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq %r13, %rsi +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3 -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %r13 +; X64-NEXT: subq $1, %r12 ; X64-NEXT: sbbq $0, %r15 -; X64-NEXT: shrq $63, %r14 -; X64-NEXT: xorl %ebp, %r14d -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3 ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al +; X64-NEXT: shlq $31, %r14 +; X64-NEXT: shrq $63, %r14 +; X64-NEXT: xorl %ebx, %r14d ; X64-NEXT: testb %r14b, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; X64-NEXT: cmovbq %r13, %rax +; X64-NEXT: cmovbq %r12, %rax ; X64-NEXT: testq %r15, %r15 -; X64-NEXT: cmovnsq %rcx, %r13 -; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: cmovnsq %rcx, %r12 +; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movl $0, %eax ; X64-NEXT: cmovnsq %rax, %r15 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: cmovaq %r13, %rax +; X64-NEXT: cmovaq %r12, %rax ; X64-NEXT: testq %r15, %r15 -; X64-NEXT: cmovsq %rcx, %r13 +; X64-NEXT: cmovsq %rcx, %r12 ; X64-NEXT: cmpq $-1, %r15 -; X64-NEXT: cmoveq %rax, %r13 -; X64-NEXT: movq %r13, %xmm0 +; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: movq %r12, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,0,1] -; X64-NEXT: movq %xmm0, %r13 -; X64-NEXT: movq %r13, %rbx -; X64-NEXT: shrq $33, %rbx -; X64-NEXT: movq %r13, %r14 -; X64-NEXT: sarq $63, %r14 -; X64-NEXT: shlq $31, %r14 -; X64-NEXT: orq %r14, %rbx +; X64-NEXT: movq %xmm0, %rbp +; X64-NEXT: movq %rbp, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %rbx, %r13 +; X64-NEXT: shldq $31, %rbp, %r13 ; X64-NEXT: pshufd $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,0,1] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shlq $31, %r13 -; X64-NEXT: movq %r13, %rdi -; X64-NEXT: movq %rbx, %rsi -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shlq $31, %rbp +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq %r13, %rsi +; X64-NEXT: movq %r14, %rcx ; X64-NEXT: callq __divti3 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -806,16 +800,17 @@ ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r12 ; X64-NEXT: sbbq $0, %r15 -; X64-NEXT: shrq $63, %r14 -; X64-NEXT: xorl %ebp, %r14d -; X64-NEXT: movq %r13, %rdi -; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: movq %r14, %rcx ; X64-NEXT: callq __modti3 ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al -; X64-NEXT: testb %r14b, %al +; X64-NEXT: shlq $31, %rbx +; X64-NEXT: shrq $63, %rbx +; X64-NEXT: xorl %r14d, %ebx +; X64-NEXT: testb %bl, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF @@ -859,141 +854,129 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $256, %esp # imm = 0x100 -; X86-NEXT: movl 24(%ebp), %ecx -; X86-NEXT: movl 40(%ebp), %ebx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: movl 40(%ebp), %edi +; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: adcl %eax, %eax -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl $1, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: shll $31, %eax -; X86-NEXT: shrl %ecx -; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: addl %edx, %edx +; X86-NEXT: adcl %ecx, %ecx +; X86-NEXT: andl $1, %ecx +; X86-NEXT: negl %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $31, %esi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shldl $31, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: negl %edi +; X86-NEXT: shll $31, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edx ; X86-NEXT: pushl %ebx +; X86-NEXT: calll __modti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: adcl %esi, %esi +; X86-NEXT: andl $1, %esi +; X86-NEXT: negl %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: shldl $31, %ecx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll $31, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edx ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 36(%ebp), %edi -; X86-NEXT: movl %edi, %edx +; X86-NEXT: leal {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 28(%ebp), %ebx +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl 20(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: adcl %eax, %eax -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl $1, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shll $31, %eax -; X86-NEXT: shrl %ecx -; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl 12(%ebp), %esi +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: addl %esi, %esi +; X86-NEXT: adcl %ecx, %ecx +; X86-NEXT: andl $1, %ecx +; X86-NEXT: negl %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shldl $31, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll $31, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: negl %ebx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %edx ; X86-NEXT: pushl %edx ; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edi -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 28(%ebp), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: adcl %eax, %eax -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl $1, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shll $31, %eax -; X86-NEXT: shrl %ecx -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $31, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: negl %esi -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %edx ; X86-NEXT: pushl %eax +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edi ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl 32(%ebp), %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl 16(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: adcl %eax, %eax ; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl $1, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shll $31, %eax -; X86-NEXT: shrl %ecx -; X86-NEXT: subl %eax, %ecx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: adcl %ebx, %ebx +; X86-NEXT: andl $1, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: shldl $31, %ecx, %edi +; X86-NEXT: shll $31, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $31, %ebx -; X86-NEXT: negl %esi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edx ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl 32(%ebp) ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl %esi +; X86-NEXT: pushl 32(%ebp) ; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp @@ -1020,7 +1003,7 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl %eax @@ -1041,18 +1024,18 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: sets %bl +; X86-NEXT: testl %edi, %edi +; X86-NEXT: sets %al ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %bh -; X86-NEXT: xorb %bl, %bh -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sets %ah +; X86-NEXT: xorb %al, %ah +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: orl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %edi, %edx ; X86-NEXT: setne %al -; X86-NEXT: testb %bh, %al +; X86-NEXT: testb %ah, %al ; X86-NEXT: cmovel %esi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1075,20 +1058,20 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: sets %bl -; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bh ; X86-NEXT: xorb %bl, %bh -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: setne %al ; X86-NEXT: testb %bh, %al ; X86-NEXT: cmovel %edx, %ecx @@ -1100,38 +1083,38 @@ ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmovel %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: testl %edx, %edx -; X86-NEXT: sets %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sets %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: testl %edx, %edx ; X86-NEXT: sets %bl ; X86-NEXT: xorb %al, %bl ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl 28(%ebp) ; X86-NEXT: pushl %edx ; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl 28(%ebp) +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl %eax @@ -1145,21 +1128,21 @@ ; X86-NEXT: setne %al ; X86-NEXT: testb %bl, %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmovel %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1176,13 +1159,13 @@ ; X86-NEXT: xorb %bl, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl %eax, %edi ; X86-NEXT: setne %al ; X86-NEXT: testb %bh, %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmovel %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -1227,15 +1210,15 @@ ; X86-NEXT: cmovsl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %edi, %edx ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: testl %eax, %eax ; X86-NEXT: cmovel %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, %edi -; X86-NEXT: cmovsl %eax, %edi +; X86-NEXT: movl $0, %esi +; X86-NEXT: cmovsl %eax, %esi ; X86-NEXT: movl $-1, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: cmovsl %edx, %eax @@ -1261,28 +1244,28 @@ ; X86-NEXT: testl %edx, %edx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: cmovbl %edx, %ecx -; X86-NEXT: andl %edx, %esi +; X86-NEXT: andl %edx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: cmovel %ecx, %esi +; X86-NEXT: cmovel %ecx, %edi ; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: testl %eax, %eax ; X86-NEXT: movl $0, %ecx ; X86-NEXT: cmoval %eax, %ecx -; X86-NEXT: cmpl $-1, %esi +; X86-NEXT: cmpl $-1, %edi ; X86-NEXT: movl $0, %edx ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: movl $-1, %edx -; X86-NEXT: cmovsl %edx, %esi +; X86-NEXT: cmovsl %edx, %edi ; X86-NEXT: movl $0, %edx ; X86-NEXT: cmovsl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: cmpl $-1, %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmpl $-1, %esi ; X86-NEXT: cmovel %ecx, %eax -; X86-NEXT: cmovnel %esi, %edi -; X86-NEXT: shldl $31, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovnel %edi, %esi +; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmpl $-1, %eax ; X86-NEXT: movl $-1, %ecx diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -290,7 +290,6 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: shrdl $8, %edx, %eax ; X32-NEXT: shrl $8, %edx ; X32-NEXT: incl %edx ; X32-NEXT: shrdl $8, %edx, %eax diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -173,11 +173,9 @@ ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: leaq (%rdi,%rdi), %rsi -; X64-NEXT: shrq $33, %rsi ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq $32, %rax -; X64-NEXT: andl $-2147483648, %eax # imm = 0x80000000 -; X64-NEXT: orq %rax, %rsi +; X64-NEXT: shrq $63, %rax +; X64-NEXT: shrdq $33, %rax, %rsi ; X64-NEXT: shlq $32, %rdi ; X64-NEXT: xorl %ebx, %ebx ; X64-NEXT: xorl %ecx, %ecx diff --git a/llvm/test/CodeGen/X86/x86-64-double-precision-shift-left.ll b/llvm/test/CodeGen/X86/x86-64-double-precision-shift-left.ll --- a/llvm/test/CodeGen/X86/x86-64-double-precision-shift-left.ll +++ b/llvm/test/CodeGen/X86/x86-64-double-precision-shift-left.ll @@ -49,9 +49,9 @@ define i64 @lshift7(i64 %a, i64 %b) nounwind readnone uwtable { ; CHECK-LABEL: lshift7: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shlq $7, %rdi ; CHECK-NEXT: shrq $57, %rsi -; CHECK-NEXT: leaq (%rsi,%rdi), %rax +; CHECK-NEXT: shlq $7, %rdi +; CHECK-NEXT: leaq (%rdi,%rsi), %rax ; CHECK-NEXT: retq entry: %shl = shl i64 %a, 7 @@ -68,9 +68,9 @@ define i64 @lshift63(i64 %a, i64 %b) nounwind readnone uwtable { ; CHECK-LABEL: lshift63: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shlq $63, %rdi ; CHECK-NEXT: shrq %rsi -; CHECK-NEXT: leaq (%rsi,%rdi), %rax +; CHECK-NEXT: shlq $63, %rdi +; CHECK-NEXT: leaq (%rdi,%rsi), %rax ; CHECK-NEXT: retq entry: %shl = shl i64 %a, 63 diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll --- a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll +++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll @@ -78,9 +78,9 @@ define i64 @_Z8lshift12mm(i64 %a, i64 %b) #2 { ; CHECK-LABEL: _Z8lshift12mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shlq $12, %rdi ; CHECK-NEXT: shrq $52, %rsi -; CHECK-NEXT: leaq (%rsi,%rdi), %rax +; CHECK-NEXT: shlq $12, %rdi +; CHECK-NEXT: leaq (%rdi,%rsi), %rax ; CHECK-NEXT: retq entry: %shl = shl i64 %a, 12