diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -46577,6 +46577,59 @@ return Ret; } +static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, + SDValue And1_L, SDValue And1_R, SDLoc DL, + SelectionDAG &DAG) { + if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse()) + return SDValue(); + SDValue NotOp = And0_L->getOperand(0); + if (NotOp == And1_R) + std::swap(And1_R, And1_L); + if (NotOp != And1_L) + return SDValue(); + + // (~(NotOp) & And0_R) | (NotOp & And1_R) + // --> ((freeze(And0_R) ^ And1_R) & NotOp) ^ freeze(And0_R) + EVT VT = And1_L->getValueType(0); + SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R); + SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp); + SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R); + return Xor1; +} + +/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the +/// equivalent `((x ^ y) & m) ^ y` pattern. +/// This is typically a better representation for targets without a fused +/// "and-not" operation. This function is intended to be called from a +/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes. +static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) { + // Note that masked-merge variants using XOR or ADD expressions are + // normalized to OR by InstCombine so we only check for OR. 
+ assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); + SDValue N0 = Node->getOperand(0); + if (N0->getOpcode() != ISD::AND || !N0->hasOneUse()) + return SDValue(); + SDValue N1 = Node->getOperand(1); + if (N1->getOpcode() != ISD::AND || !N1->hasOneUse()) + return SDValue(); + + SDLoc DL(Node); + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + SDValue N10 = N1->getOperand(0); + SDValue N11 = N1->getOperand(1); + if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG)) + return Result; + return SDValue(); +} + static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -46670,6 +46723,11 @@ return Res; } + // We should fold "masked merge" patterns when `andn` is not available. 
+ if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1) + if (SDValue R = foldMaskedMerge(N, DAG)) + return R; + return SDValue(); } diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll --- a/llvm/test/CodeGen/X86/fold-masked-merge.ll +++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll @@ -8,11 +8,10 @@ define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) { ; NOBMI-LABEL: masked_merge0: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movl %edi, %eax -; NOBMI-NEXT: andl %edi, %esi -; NOBMI-NEXT: notl %eax -; NOBMI-NEXT: andl %edx, %eax -; NOBMI-NEXT: orl %esi, %eax +; NOBMI-NEXT: movl %esi, %eax +; NOBMI-NEXT: xorl %edx, %eax +; NOBMI-NEXT: andl %edi, %eax +; NOBMI-NEXT: xorl %edx, %eax ; NOBMI-NEXT: retq ; ; BMI-LABEL: masked_merge0: @@ -54,14 +53,22 @@ } define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) { -; CHECK-LABEL: masked_merge2: -; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: notb %al -; CHECK-NEXT: andb %sil, %al -; CHECK-NEXT: andb %dil, %sil -; CHECK-NEXT: orb %sil, %al -; CHECK-NEXT: retq +; NOBMI-LABEL: masked_merge2: +; NOBMI: # %bb.0: +; NOBMI-NEXT: movl %esi, %eax +; NOBMI-NEXT: xorb %sil, %al +; NOBMI-NEXT: andb %dil, %al +; NOBMI-NEXT: xorb %sil, %al +; NOBMI-NEXT: retq +; +; BMI-LABEL: masked_merge2: +; BMI: # %bb.0: +; BMI-NEXT: movl %edi, %eax +; BMI-NEXT: notb %al +; BMI-NEXT: andb %sil, %al +; BMI-NEXT: andb %dil, %sil +; BMI-NEXT: orb %sil, %al +; BMI-NEXT: retq %not = xor i8 %a0, -1 %and0 = and i8 %not, %a1 %and1 = and i8 %a1, %a0 @@ -72,13 +79,12 @@ define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) { ; NOBMI-LABEL: masked_merge3: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movq %rdi, %rax -; NOBMI-NEXT: notq %rsi +; NOBMI-NEXT: movq %rsi, %rax ; NOBMI-NEXT: notq %rdx -; NOBMI-NEXT: andq %rdi, %rsi +; NOBMI-NEXT: xorq %rdx, %rax ; NOBMI-NEXT: notq %rax -; NOBMI-NEXT: andq %rdx, %rax -; NOBMI-NEXT: orq %rsi, %rax +; NOBMI-NEXT: andq %rdi, %rax +; NOBMI-NEXT: xorq %rdx, %rax ; NOBMI-NEXT: retq 
; ; BMI-LABEL: masked_merge3: diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll @@ -58,11 +58,10 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out32: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: andl %edx, %edi -; CHECK-NOBMI-NEXT: notl %eax -; CHECK-NOBMI-NEXT: andl %esi, %eax -; CHECK-NOBMI-NEXT: orl %edi, %eax +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorl %esi, %eax +; CHECK-NOBMI-NEXT: andl %edx, %eax +; CHECK-NOBMI-NEXT: xorl %esi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out32: @@ -81,11 +80,10 @@ define i64 @out64(i64 %x, i64 %y, i64 %mask) { ; CHECK-NOBMI-LABEL: out64: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movq %rdx, %rax -; CHECK-NOBMI-NEXT: andq %rdx, %rdi -; CHECK-NOBMI-NEXT: notq %rax -; CHECK-NOBMI-NEXT: andq %rsi, %rax -; CHECK-NOBMI-NEXT: orq %rdi, %rax +; CHECK-NOBMI-NEXT: movq %rdi, %rax +; CHECK-NOBMI-NEXT: xorq %rsi, %rax +; CHECK-NOBMI-NEXT: andq %rdx, %rax +; CHECK-NOBMI-NEXT: xorq %rsi, %rax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out64: @@ -628,11 +626,10 @@ define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_varx_42: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: andl %edx, %edi -; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: notl %eax -; CHECK-NOBMI-NEXT: andl $42, %eax -; CHECK-NOBMI-NEXT: orl %edi, %eax +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorl $42, %eax +; CHECK-NOBMI-NEXT: andl %edx, %eax +; CHECK-NOBMI-NEXT: xorl $42, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_varx_42: @@ -673,11 +670,10 @@ define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_varx_42_invmask: ; 
CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: notl %eax -; CHECK-NOBMI-NEXT: andl %edi, %eax -; CHECK-NOBMI-NEXT: andl $42, %edx -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorl $42, %eax +; CHECK-NOBMI-NEXT: andl %edx, %eax +; CHECK-NOBMI-NEXT: xorl %edi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_varx_42_invmask: @@ -801,11 +797,10 @@ define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_42_vary: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: notl %eax -; CHECK-NOBMI-NEXT: andl $42, %edx -; CHECK-NOBMI-NEXT: andl %esi, %eax -; CHECK-NOBMI-NEXT: orl %edx, %eax +; CHECK-NOBMI-NEXT: movl %esi, %eax +; CHECK-NOBMI-NEXT: xorl $42, %eax +; CHECK-NOBMI-NEXT: andl %edx, %eax +; CHECK-NOBMI-NEXT: xorl %esi, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_42_vary: @@ -844,11 +839,10 @@ define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-NOBMI-LABEL: out_constant_42_vary_invmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: andl %edx, %esi -; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: notl %eax -; CHECK-NOBMI-NEXT: andl $42, %eax -; CHECK-NOBMI-NEXT: orl %esi, %eax +; CHECK-NOBMI-NEXT: movl %esi, %eax +; CHECK-NOBMI-NEXT: xorl $42, %eax +; CHECK-NOBMI-NEXT: andl %edx, %eax +; CHECK-NOBMI-NEXT: xorl $42, %eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out_constant_42_vary_invmask: diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -107,64 +107,50 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v4i8: ; CHECK-BASELINE: # %bb.0: -; 
CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dil ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: andb %bl, %r8b -; CHECK-BASELINE-NEXT: andb %r11b, %cl -; CHECK-BASELINE-NEXT: andb %r10b, %dl -; CHECK-BASELINE-NEXT: andb %dil, %sil -; CHECK-BASELINE-NEXT: notb %r10b -; CHECK-BASELINE-NEXT: notb %r11b -; CHECK-BASELINE-NEXT: notb %bl -; CHECK-BASELINE-NEXT: notb %dil -; CHECK-BASELINE-NEXT: andb %r9b, %dil -; CHECK-BASELINE-NEXT: orb %sil, %dil -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: orb %r8b, %bl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: orb %cl, %r11b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: orb %dl, %r10b -; CHECK-BASELINE-NEXT: movb %bl, 3(%rax) -; CHECK-BASELINE-NEXT: movb %r11b, 2(%rax) -; CHECK-BASELINE-NEXT: movb %r10b, 1(%rax) -; CHECK-BASELINE-NEXT: movb %dil, (%rax) -; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: xorb %r9b, %sil +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: xorb %r9b, %sil +; CHECK-BASELINE-NEXT: xorb %r11b, %dl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: xorb %r11b, %dl +; CHECK-BASELINE-NEXT: xorb %r10b, %cl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: xorb %r10b, %cl +; CHECK-BASELINE-NEXT: xorb %dil, %r8b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-BASELINE-NEXT: xorb %dil, %r8b +; CHECK-BASELINE-NEXT: movb %r8b, 3(%rax) +; CHECK-BASELINE-NEXT: movb %cl, 2(%rax) +; CHECK-BASELINE-NEXT: movb %dl, 1(%rax) +; CHECK-BASELINE-NEXT: movb %sil, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i8: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movb 
{{[0-9]+}}(%rsp), %dil ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: andb %bl, %r8b -; CHECK-SSE1-NEXT: andb %r11b, %cl -; CHECK-SSE1-NEXT: andb %r10b, %dl -; CHECK-SSE1-NEXT: andb %dil, %sil -; CHECK-SSE1-NEXT: notb %r10b -; CHECK-SSE1-NEXT: notb %r11b -; CHECK-SSE1-NEXT: notb %bl -; CHECK-SSE1-NEXT: notb %dil -; CHECK-SSE1-NEXT: andb %r9b, %dil -; CHECK-SSE1-NEXT: orb %sil, %dil -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: orb %r8b, %bl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: orb %cl, %r11b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: orb %dl, %r10b -; CHECK-SSE1-NEXT: movb %bl, 3(%rax) -; CHECK-SSE1-NEXT: movb %r11b, 2(%rax) -; CHECK-SSE1-NEXT: movb %r10b, 1(%rax) -; CHECK-SSE1-NEXT: movb %dil, (%rax) -; CHECK-SSE1-NEXT: popq %rbx +; CHECK-SSE1-NEXT: xorb %r9b, %sil +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: xorb %r9b, %sil +; CHECK-SSE1-NEXT: xorb %r11b, %dl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: xorb %r11b, %dl +; CHECK-SSE1-NEXT: xorb %r10b, %cl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: xorb %r10b, %cl +; CHECK-SSE1-NEXT: xorb %dil, %r8b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-SSE1-NEXT: xorb %dil, %r8b +; CHECK-SSE1-NEXT: movb %r8b, 3(%rax) +; CHECK-SSE1-NEXT: movb %cl, 2(%rax) +; CHECK-SSE1-NEXT: movb %dl, 1(%rax) +; CHECK-SSE1-NEXT: movb %sil, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i8: @@ -191,24 +177,20 @@ ; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dil ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: andb %r11b, %r8b -; CHECK-BASELINE-NEXT: andb %r10b, %dl -; CHECK-BASELINE-NEXT: andb %dil, %sil ; CHECK-BASELINE-NEXT: andb 
{{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: notb %r10b -; CHECK-BASELINE-NEXT: notb %r11b -; CHECK-BASELINE-NEXT: notb %dil -; CHECK-BASELINE-NEXT: andb %r9b, %dil -; CHECK-BASELINE-NEXT: orb %sil, %dil -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: orb %r8b, %r11b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: orb %dl, %r10b +; CHECK-BASELINE-NEXT: xorb %r9b, %sil +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: xorb %r9b, %sil +; CHECK-BASELINE-NEXT: xorb %r10b, %dl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: xorb %r10b, %dl +; CHECK-BASELINE-NEXT: xorb %dil, %r8b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-BASELINE-NEXT: xorb %dil, %r8b ; CHECK-BASELINE-NEXT: movb %cl, 2(%rax) -; CHECK-BASELINE-NEXT: movb %r11b, 3(%rax) -; CHECK-BASELINE-NEXT: movb %r10b, 1(%rax) -; CHECK-BASELINE-NEXT: movb %dil, (%rax) +; CHECK-BASELINE-NEXT: movb %r8b, 3(%rax) +; CHECK-BASELINE-NEXT: movb %dl, 1(%rax) +; CHECK-BASELINE-NEXT: movb %sil, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i8_undef: @@ -216,24 +198,20 @@ ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dil ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: andb %r11b, %r8b -; CHECK-SSE1-NEXT: andb %r10b, %dl -; CHECK-SSE1-NEXT: andb %dil, %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: notb %r10b -; CHECK-SSE1-NEXT: notb %r11b -; CHECK-SSE1-NEXT: notb %dil -; CHECK-SSE1-NEXT: andb %r9b, %dil -; CHECK-SSE1-NEXT: orb %sil, %dil -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: orb %r8b, %r11b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: orb %dl, %r10b +; CHECK-SSE1-NEXT: xorb %r9b, %sil +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: xorb %r9b, %sil +; CHECK-SSE1-NEXT: xorb %r10b, %dl +; 
CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: xorb %r10b, %dl +; CHECK-SSE1-NEXT: xorb %dil, %r8b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-SSE1-NEXT: xorb %dil, %r8b ; CHECK-SSE1-NEXT: movb %cl, 2(%rax) -; CHECK-SSE1-NEXT: movb %r11b, 3(%rax) -; CHECK-SSE1-NEXT: movb %r10b, 1(%rax) -; CHECK-SSE1-NEXT: movb %dil, (%rax) +; CHECK-SSE1-NEXT: movb %r8b, 3(%rax) +; CHECK-SSE1-NEXT: movb %dl, 1(%rax) +; CHECK-SSE1-NEXT: movb %sil, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i8_undef: @@ -306,11 +284,10 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: out_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: andl %edx, %edi -; CHECK-NEXT: notl %eax -; CHECK-NEXT: andl %esi, %eax -; CHECK-NEXT: orl %edi, %eax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: andl %edx, %eax +; CHECK-NEXT: xorl %esi, %eax ; CHECK-NEXT: retq %mx = and <1 x i32> %x, %mask %notmask = xor <1 x i32> %mask, @@ -329,66 +306,54 @@ ; CHECK-BASELINE-NEXT: pushq %rbp ; CHECK-BASELINE-NEXT: pushq %r15 ; CHECK-BASELINE-NEXT: pushq %r14 -; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r12b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dil +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r12b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: andb %bl, %r9b -; CHECK-BASELINE-NEXT: andb %r15b, %r8b -; CHECK-BASELINE-NEXT: andb %bpl, %cl -; 
CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: andb %r11b, %dl -; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: andb %dil, %sil -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: andb %r12b, %r13b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: andb %r14b, %cl -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dl -; CHECK-BASELINE-NEXT: andb %r10b, %dl -; CHECK-BASELINE-NEXT: notb %dil -; CHECK-BASELINE-NEXT: notb %r11b -; CHECK-BASELINE-NEXT: notb %bpl -; CHECK-BASELINE-NEXT: notb %r15b -; CHECK-BASELINE-NEXT: notb %bl -; CHECK-BASELINE-NEXT: notb %r10b -; CHECK-BASELINE-NEXT: notb %r14b -; CHECK-BASELINE-NEXT: notb %r12b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b -; CHECK-BASELINE-NEXT: orb %r13b, %r12b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-BASELINE-NEXT: orb %cl, %r14b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: orb %dl, %r10b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: orb %r9b, %bl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-BASELINE-NEXT: orb %r8b, %r15b +; CHECK-BASELINE-NEXT: xorb %bl, %sil +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: xorb %bl, %sil +; CHECK-BASELINE-NEXT: xorb %r12b, %dl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: xorb %r12b, %dl +; CHECK-BASELINE-NEXT: xorb %r15b, %cl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: xorb %r15b, %cl +; CHECK-BASELINE-NEXT: xorb %r14b, %r8b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-BASELINE-NEXT: xorb %r14b, %r8b +; CHECK-BASELINE-NEXT: xorb %bpl, %r9b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b +; CHECK-BASELINE-NEXT: xorb %bpl, %r9b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-BASELINE-NEXT: xorb %r11b, %bpl ; 
CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-BASELINE-NEXT: orb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %r11b, %bpl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: xorb %r10b, %r11b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: orb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dil -; CHECK-BASELINE-NEXT: orb %sil, %dil -; CHECK-BASELINE-NEXT: movb %r12b, 7(%rax) -; CHECK-BASELINE-NEXT: movb %r14b, 6(%rax) -; CHECK-BASELINE-NEXT: movb %r10b, 5(%rax) -; CHECK-BASELINE-NEXT: movb %bl, 4(%rax) -; CHECK-BASELINE-NEXT: movb %r15b, 3(%rax) -; CHECK-BASELINE-NEXT: movb %bpl, 2(%rax) -; CHECK-BASELINE-NEXT: movb %r11b, 1(%rax) -; CHECK-BASELINE-NEXT: movb %dil, (%rax) +; CHECK-BASELINE-NEXT: xorb %r10b, %r11b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: xorb %dil, %bl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: xorb %dil, %bl +; CHECK-BASELINE-NEXT: movb %bl, 7(%rax) +; CHECK-BASELINE-NEXT: movb %r11b, 6(%rax) +; CHECK-BASELINE-NEXT: movb %bpl, 5(%rax) +; CHECK-BASELINE-NEXT: movb %r9b, 4(%rax) +; CHECK-BASELINE-NEXT: movb %r8b, 3(%rax) +; CHECK-BASELINE-NEXT: movb %cl, 2(%rax) +; CHECK-BASELINE-NEXT: movb %dl, 1(%rax) +; CHECK-BASELINE-NEXT: movb %sil, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 -; CHECK-BASELINE-NEXT: popq %r13 ; CHECK-BASELINE-NEXT: popq %r14 ; CHECK-BASELINE-NEXT: popq %r15 ; CHECK-BASELINE-NEXT: popq %rbp @@ -399,66 +364,54 @@ ; CHECK-SSE1-NEXT: pushq %rbp ; CHECK-SSE1-NEXT: pushq %r15 ; CHECK-SSE1-NEXT: pushq %r14 -; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r12b ; 
CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dil +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r12b ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: andb %bl, %r9b -; CHECK-SSE1-NEXT: andb %r15b, %r8b -; CHECK-SSE1-NEXT: andb %bpl, %cl -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: andb %r11b, %dl -; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: andb %dil, %sil -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: andb %r12b, %r13b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: andb %r14b, %cl -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dl -; CHECK-SSE1-NEXT: andb %r10b, %dl -; CHECK-SSE1-NEXT: notb %dil -; CHECK-SSE1-NEXT: notb %r11b -; CHECK-SSE1-NEXT: notb %bpl -; CHECK-SSE1-NEXT: notb %r15b -; CHECK-SSE1-NEXT: notb %bl -; CHECK-SSE1-NEXT: notb %r10b -; CHECK-SSE1-NEXT: notb %r14b -; CHECK-SSE1-NEXT: notb %r12b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b -; CHECK-SSE1-NEXT: orb %r13b, %r12b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-SSE1-NEXT: orb %cl, %r14b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: orb %dl, %r10b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: orb %r9b, %bl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-SSE1-NEXT: orb %r8b, %r15b +; CHECK-SSE1-NEXT: xorb %bl, %sil +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: xorb %bl, %sil +; CHECK-SSE1-NEXT: xorb %r12b, %dl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: xorb %r12b, %dl +; CHECK-SSE1-NEXT: xorb %r15b, %cl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: xorb %r15b, %cl +; CHECK-SSE1-NEXT: xorb 
%r14b, %r8b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-SSE1-NEXT: xorb %r14b, %r8b +; CHECK-SSE1-NEXT: xorb %bpl, %r9b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b +; CHECK-SSE1-NEXT: xorb %bpl, %r9b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-SSE1-NEXT: xorb %r11b, %bpl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-SSE1-NEXT: orb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %r11b, %bpl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: xorb %r10b, %r11b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: orb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Folded Reload -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dil -; CHECK-SSE1-NEXT: orb %sil, %dil -; CHECK-SSE1-NEXT: movb %r12b, 7(%rax) -; CHECK-SSE1-NEXT: movb %r14b, 6(%rax) -; CHECK-SSE1-NEXT: movb %r10b, 5(%rax) -; CHECK-SSE1-NEXT: movb %bl, 4(%rax) -; CHECK-SSE1-NEXT: movb %r15b, 3(%rax) -; CHECK-SSE1-NEXT: movb %bpl, 2(%rax) -; CHECK-SSE1-NEXT: movb %r11b, 1(%rax) -; CHECK-SSE1-NEXT: movb %dil, (%rax) +; CHECK-SSE1-NEXT: xorb %r10b, %r11b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: xorb %dil, %bl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: xorb %dil, %bl +; CHECK-SSE1-NEXT: movb %bl, 7(%rax) +; CHECK-SSE1-NEXT: movb %r11b, 6(%rax) +; CHECK-SSE1-NEXT: movb %bpl, 5(%rax) +; CHECK-SSE1-NEXT: movb %r9b, 4(%rax) +; CHECK-SSE1-NEXT: movb %r8b, 3(%rax) +; CHECK-SSE1-NEXT: movb %cl, 2(%rax) +; CHECK-SSE1-NEXT: movb %dl, 1(%rax) +; CHECK-SSE1-NEXT: movb %sil, (%rax) ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 -; CHECK-SSE1-NEXT: popq %r13 ; CHECK-SSE1-NEXT: popq %r14 ; CHECK-SSE1-NEXT: popq %r15 ; CHECK-SSE1-NEXT: popq %rbp @@ -485,64 +438,50 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v4i16: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, 
%rax -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: andl %ebx, %esi -; CHECK-BASELINE-NEXT: andl %edi, %r8d -; CHECK-BASELINE-NEXT: andl %r11d, %ecx -; CHECK-BASELINE-NEXT: andl %r10d, %edx -; CHECK-BASELINE-NEXT: notl %r10d -; CHECK-BASELINE-NEXT: notl %r11d -; CHECK-BASELINE-NEXT: notl %edi -; CHECK-BASELINE-NEXT: notl %ebx -; CHECK-BASELINE-NEXT: andl %r9d, %ebx -; CHECK-BASELINE-NEXT: orl %esi, %ebx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di -; CHECK-BASELINE-NEXT: orl %r8d, %edi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-BASELINE-NEXT: orl %ecx, %r11d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-BASELINE-NEXT: orl %edx, %r10d -; CHECK-BASELINE-NEXT: movw %bx, (%rax) -; CHECK-BASELINE-NEXT: movw %di, 6(%rax) -; CHECK-BASELINE-NEXT: movw %r11w, 4(%rax) -; CHECK-BASELINE-NEXT: movw %r10w, 2(%rax) -; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: xorl %edi, %edx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-BASELINE-NEXT: xorl %edi, %edx +; CHECK-BASELINE-NEXT: xorl %r11d, %ecx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-BASELINE-NEXT: xorl %r11d, %ecx +; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) +; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) +; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) +; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i16: ; 
CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: andl %ebx, %esi -; CHECK-SSE1-NEXT: andl %edi, %r8d -; CHECK-SSE1-NEXT: andl %r11d, %ecx -; CHECK-SSE1-NEXT: andl %r10d, %edx -; CHECK-SSE1-NEXT: notl %r10d -; CHECK-SSE1-NEXT: notl %r11d -; CHECK-SSE1-NEXT: notl %edi -; CHECK-SSE1-NEXT: notl %ebx -; CHECK-SSE1-NEXT: andl %r9d, %ebx -; CHECK-SSE1-NEXT: orl %esi, %ebx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di -; CHECK-SSE1-NEXT: orl %r8d, %edi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-SSE1-NEXT: orl %ecx, %r11d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-SSE1-NEXT: orl %edx, %r10d -; CHECK-SSE1-NEXT: movw %bx, (%rax) -; CHECK-SSE1-NEXT: movw %di, 6(%rax) -; CHECK-SSE1-NEXT: movw %r11w, 4(%rax) -; CHECK-SSE1-NEXT: movw %r10w, 2(%rax) -; CHECK-SSE1-NEXT: popq %rbx +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: xorl %edi, %edx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-SSE1-NEXT: xorl %edi, %edx +; CHECK-SSE1-NEXT: xorl %r11d, %ecx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-SSE1-NEXT: xorl %r11d, %ecx +; CHECK-SSE1-NEXT: xorl %r10d, %r8d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-SSE1-NEXT: xorl %r10d, %r8d +; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) +; CHECK-SSE1-NEXT: movw %cx, 4(%rax) +; CHECK-SSE1-NEXT: movw %dx, 2(%rax) +; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i16: @@ -567,51 +506,43 @@ ; CHECK-BASELINE-LABEL: out_v4i16_undef: ; CHECK-BASELINE: # %bb.0: ; 
CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: andl %edi, %esi -; CHECK-BASELINE-NEXT: andl %r11d, %r8d -; CHECK-BASELINE-NEXT: andl %r10d, %edx +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: notl %r10d -; CHECK-BASELINE-NEXT: notl %r11d -; CHECK-BASELINE-NEXT: notl %edi -; CHECK-BASELINE-NEXT: andl %r9d, %edi -; CHECK-BASELINE-NEXT: orl %esi, %edi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-BASELINE-NEXT: orl %r8d, %r11d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-BASELINE-NEXT: orl %edx, %r10d +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: xorl %edi, %edx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-BASELINE-NEXT: xorl %edi, %edx +; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-BASELINE-NEXT: xorl %r10d, %r8d ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) -; CHECK-BASELINE-NEXT: movw %di, (%rax) -; CHECK-BASELINE-NEXT: movw %r11w, 6(%rax) -; CHECK-BASELINE-NEXT: movw %r10w, 2(%rax) +; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) +; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) +; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i16_undef: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: andl %edi, %esi -; CHECK-SSE1-NEXT: andl %r11d, %r8d -; CHECK-SSE1-NEXT: andl %r10d, %edx +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movzwl 
{{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: notl %r10d -; CHECK-SSE1-NEXT: notl %r11d -; CHECK-SSE1-NEXT: notl %edi -; CHECK-SSE1-NEXT: andl %r9d, %edi -; CHECK-SSE1-NEXT: orl %esi, %edi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-SSE1-NEXT: orl %r8d, %r11d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-SSE1-NEXT: orl %edx, %r10d +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: xorl %edi, %edx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-SSE1-NEXT: xorl %edi, %edx +; CHECK-SSE1-NEXT: xorl %r10d, %r8d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-SSE1-NEXT: xorl %r10d, %r8d ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) -; CHECK-SSE1-NEXT: movw %di, (%rax) -; CHECK-SSE1-NEXT: movw %r11w, 6(%rax) -; CHECK-SSE1-NEXT: movw %r10w, 2(%rax) +; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) +; CHECK-SSE1-NEXT: movw %dx, 2(%rax) +; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i16_undef: @@ -635,30 +566,26 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v2i32: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: movl %r8d, %eax +; CHECK-BASELINE-NEXT: movl %edi, %eax +; CHECK-BASELINE-NEXT: xorl %edx, %eax +; CHECK-BASELINE-NEXT: andl %r8d, %eax +; CHECK-BASELINE-NEXT: xorl %edx, %eax +; CHECK-BASELINE-NEXT: xorl %ecx, %esi ; CHECK-BASELINE-NEXT: andl %r9d, %esi -; CHECK-BASELINE-NEXT: andl %r8d, %edi -; CHECK-BASELINE-NEXT: notl %eax -; CHECK-BASELINE-NEXT: notl %r9d -; CHECK-BASELINE-NEXT: andl %ecx, %r9d -; CHECK-BASELINE-NEXT: orl %esi, %r9d -; CHECK-BASELINE-NEXT: andl %edx, %eax -; CHECK-BASELINE-NEXT: orl %edi, %eax -; CHECK-BASELINE-NEXT: movl %r9d, %edx +; CHECK-BASELINE-NEXT: xorl %ecx, %esi +; CHECK-BASELINE-NEXT: movl %esi, %edx ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v2i32: ; 
CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movl %r8d, %eax +; CHECK-SSE1-NEXT: movl %edi, %eax +; CHECK-SSE1-NEXT: xorl %edx, %eax +; CHECK-SSE1-NEXT: andl %r8d, %eax +; CHECK-SSE1-NEXT: xorl %edx, %eax +; CHECK-SSE1-NEXT: xorl %ecx, %esi ; CHECK-SSE1-NEXT: andl %r9d, %esi -; CHECK-SSE1-NEXT: andl %r8d, %edi -; CHECK-SSE1-NEXT: notl %eax -; CHECK-SSE1-NEXT: notl %r9d -; CHECK-SSE1-NEXT: andl %ecx, %r9d -; CHECK-SSE1-NEXT: orl %esi, %r9d -; CHECK-SSE1-NEXT: andl %edx, %eax -; CHECK-SSE1-NEXT: orl %edi, %eax -; CHECK-SSE1-NEXT: movl %r9d, %edx +; CHECK-SSE1-NEXT: xorl %ecx, %esi +; CHECK-SSE1-NEXT: movl %esi, %edx ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v2i32: @@ -682,11 +609,10 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: out_v1i64: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: andq %rdx, %rdi -; CHECK-NEXT: notq %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: orq %rdi, %rax +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: xorq %rsi, %rax +; CHECK-NEXT: andq %rdx, %rax +; CHECK-NEXT: xorq %rsi, %rax ; CHECK-NEXT: retq %mx = and <1 x i64> %x, %mask %notmask = xor <1 x i64> %mask, @@ -708,120 +634,105 @@ ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dil -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: movl %edx, %r11d +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; 
CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r12b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dl -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %cl, %sil -; CHECK-BASELINE-NEXT: notb %cl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: xorb %bl, %sil +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: xorb %bl, %sil +; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: xorb %dl, %r11b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: xorb %dl, %r11b +; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: orb %sil, %cl -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %dl, %sil -; CHECK-BASELINE-NEXT: notb %dl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl -; CHECK-BASELINE-NEXT: orb %sil, %dl -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %bl, %sil -; CHECK-BASELINE-NEXT: notb %bl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: orb %sil, %bl -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %r13b, %sil -; CHECK-BASELINE-NEXT: notb %r13b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: orb %sil, %r13b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %r12b, %sil -; CHECK-BASELINE-NEXT: notb %r12b +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
CHECK-BASELINE-NEXT: xorb %r10b, %r8b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-BASELINE-NEXT: xorb %r10b, %r8b +; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: xorb %r12b, %r9b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b +; CHECK-BASELINE-NEXT: xorb %r12b, %r9b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r12b +; CHECK-BASELINE-NEXT: xorb %bpl, %r12b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b -; CHECK-BASELINE-NEXT: orb %sil, %r12b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %r15b, %sil -; CHECK-BASELINE-NEXT: notb %r15b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-BASELINE-NEXT: orb %sil, %r15b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %r14b, %sil -; CHECK-BASELINE-NEXT: notb %r14b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-BASELINE-NEXT: orb %sil, %r14b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %bpl, %sil -; CHECK-BASELINE-NEXT: notb %bpl +; CHECK-BASELINE-NEXT: xorb %bpl, %r12b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-BASELINE-NEXT: xorb %r14b, %bpl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-BASELINE-NEXT: orb %sil, %bpl -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %r11b, %sil -; CHECK-BASELINE-NEXT: notb %r11b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: orb %sil, %r11b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %r10b, %sil -; CHECK-BASELINE-NEXT: notb %r10b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: orb %sil, %r10b +; CHECK-BASELINE-NEXT: xorb %r14b, %bpl ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: andb %dil, %sil -; CHECK-BASELINE-NEXT: notb %dil -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dil -; 
CHECK-BASELINE-NEXT: orb %sil, %dil -; CHECK-BASELINE-NEXT: movb %cl, 15(%rax) -; CHECK-BASELINE-NEXT: movb %dl, 14(%rax) -; CHECK-BASELINE-NEXT: movb %bl, 13(%rax) -; CHECK-BASELINE-NEXT: movb %r13b, 12(%rax) -; CHECK-BASELINE-NEXT: movb %r12b, 11(%rax) -; CHECK-BASELINE-NEXT: movb %r15b, 10(%rax) -; CHECK-BASELINE-NEXT: movb %r14b, 9(%rax) -; CHECK-BASELINE-NEXT: movb %bpl, 8(%rax) -; CHECK-BASELINE-NEXT: movb %r11b, 7(%rax) -; CHECK-BASELINE-NEXT: movb %r10b, 6(%rax) -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: andb %cl, %r9b -; CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: orb %r9b, %cl -; CHECK-BASELINE-NEXT: movb %dil, 5(%rax) -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dl -; CHECK-BASELINE-NEXT: andb %dl, %r8b -; CHECK-BASELINE-NEXT: notb %dl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl -; CHECK-BASELINE-NEXT: orb %r8b, %dl -; CHECK-BASELINE-NEXT: movb %cl, 4(%rax) -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload -; CHECK-BASELINE-NEXT: andb %cl, %sil -; CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: orb %sil, %cl -; CHECK-BASELINE-NEXT: movb %dl, 3(%rax) +; CHECK-BASELINE-NEXT: xorb %r15b, %sil +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: xorb %r15b, %sil ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dl -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload -; CHECK-BASELINE-NEXT: andb %dl, %sil -; CHECK-BASELINE-NEXT: notb %dl +; CHECK-BASELINE-NEXT: xorb %r13b, %dl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl -; CHECK-BASELINE-NEXT: orb %sil, %dl -; CHECK-BASELINE-NEXT: movb %cl, 2(%rax) +; CHECK-BASELINE-NEXT: xorb %r13b, %dl ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload -; 
CHECK-BASELINE-NEXT: andb %cl, %sil -; CHECK-BASELINE-NEXT: notb %cl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: orb %sil, %cl -; CHECK-BASELINE-NEXT: movb %dl, 1(%rax) -; CHECK-BASELINE-NEXT: movb %cl, (%rax) +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: xorb %al, %r13b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: xorb %al, %r13b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: xorb %al, %r15b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: xorb %al, %r15b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: xorb %al, %r14b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: xorb %al, %r14b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r8b +; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r8b +; CHECK-BASELINE-NEXT: xorb %r8b, %r10b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: xorb %r8b, %r10b +; CHECK-BASELINE-NEXT: movb %r10b, 15(%rdi) +; CHECK-BASELINE-NEXT: movb %al, 14(%rdi) +; CHECK-BASELINE-NEXT: movb %bl, 13(%rdi) +; CHECK-BASELINE-NEXT: movb %r14b, 12(%rdi) +; CHECK-BASELINE-NEXT: movb %r15b, 11(%rdi) 
+; CHECK-BASELINE-NEXT: movb %r13b, 10(%rdi) +; CHECK-BASELINE-NEXT: movb %cl, 9(%rdi) +; CHECK-BASELINE-NEXT: movb %dl, 8(%rdi) +; CHECK-BASELINE-NEXT: movb %sil, 7(%rdi) +; CHECK-BASELINE-NEXT: movb %bpl, 6(%rdi) +; CHECK-BASELINE-NEXT: movb %r12b, 5(%rdi) +; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 3(%rdi) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 2(%rdi) +; CHECK-BASELINE-NEXT: movb %r11b, 1(%rdi) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: movb %al, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -838,120 +749,105 @@ ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dil -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-SSE1-NEXT: movl %edx, %r11d +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r12b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dl -; CHECK-SSE1-NEXT: movb 
{{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %cl, %sil -; CHECK-SSE1-NEXT: notb %cl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: xorb %bl, %sil +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: xorb %bl, %sil +; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: xorb %dl, %r11b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: xorb %dl, %r11b +; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: orb %sil, %cl -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %dl, %sil -; CHECK-SSE1-NEXT: notb %dl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl -; CHECK-SSE1-NEXT: orb %sil, %dl -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %bl, %sil -; CHECK-SSE1-NEXT: notb %bl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: orb %sil, %bl -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %r13b, %sil -; CHECK-SSE1-NEXT: notb %r13b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: orb %sil, %r13b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %r12b, %sil -; CHECK-SSE1-NEXT: notb %r12b +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: xorb %r10b, %r8b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-SSE1-NEXT: xorb %r10b, %r8b +; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: xorb %r12b, %r9b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b +; CHECK-SSE1-NEXT: xorb %r12b, %r9b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r12b +; CHECK-SSE1-NEXT: xorb %bpl, %r12b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b -; CHECK-SSE1-NEXT: orb %sil, %r12b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %r15b, %sil -; 
CHECK-SSE1-NEXT: notb %r15b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-SSE1-NEXT: orb %sil, %r15b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %r14b, %sil -; CHECK-SSE1-NEXT: notb %r14b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-SSE1-NEXT: orb %sil, %r14b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %bpl, %sil -; CHECK-SSE1-NEXT: notb %bpl +; CHECK-SSE1-NEXT: xorb %bpl, %r12b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-SSE1-NEXT: xorb %r14b, %bpl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-SSE1-NEXT: orb %sil, %bpl -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %r11b, %sil -; CHECK-SSE1-NEXT: notb %r11b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: orb %sil, %r11b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %r10b, %sil -; CHECK-SSE1-NEXT: notb %r10b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: orb %sil, %r10b +; CHECK-SSE1-NEXT: xorb %r14b, %bpl ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: andb %dil, %sil -; CHECK-SSE1-NEXT: notb %dil -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dil -; CHECK-SSE1-NEXT: orb %sil, %dil -; CHECK-SSE1-NEXT: movb %cl, 15(%rax) -; CHECK-SSE1-NEXT: movb %dl, 14(%rax) -; CHECK-SSE1-NEXT: movb %bl, 13(%rax) -; CHECK-SSE1-NEXT: movb %r13b, 12(%rax) -; CHECK-SSE1-NEXT: movb %r12b, 11(%rax) -; CHECK-SSE1-NEXT: movb %r15b, 10(%rax) -; CHECK-SSE1-NEXT: movb %r14b, 9(%rax) -; CHECK-SSE1-NEXT: movb %bpl, 8(%rax) -; CHECK-SSE1-NEXT: movb %r11b, 7(%rax) -; CHECK-SSE1-NEXT: movb %r10b, 6(%rax) -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: andb %cl, %r9b -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: orb %r9b, %cl -; CHECK-SSE1-NEXT: movb %dil, 5(%rax) -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dl -; CHECK-SSE1-NEXT: andb %dl, %r8b -; CHECK-SSE1-NEXT: 
notb %dl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl -; CHECK-SSE1-NEXT: orb %r8b, %dl -; CHECK-SSE1-NEXT: movb %cl, 4(%rax) -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload -; CHECK-SSE1-NEXT: andb %cl, %sil -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: orb %sil, %cl -; CHECK-SSE1-NEXT: movb %dl, 3(%rax) +; CHECK-SSE1-NEXT: xorb %r15b, %sil +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: xorb %r15b, %sil ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dl -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload -; CHECK-SSE1-NEXT: andb %dl, %sil -; CHECK-SSE1-NEXT: notb %dl +; CHECK-SSE1-NEXT: xorb %r13b, %dl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl -; CHECK-SSE1-NEXT: orb %sil, %dl -; CHECK-SSE1-NEXT: movb %cl, 2(%rax) +; CHECK-SSE1-NEXT: xorb %r13b, %dl ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload -; CHECK-SSE1-NEXT: andb %cl, %sil -; CHECK-SSE1-NEXT: notb %cl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: orb %sil, %cl -; CHECK-SSE1-NEXT: movb %dl, 1(%rax) -; CHECK-SSE1-NEXT: movb %cl, (%rax) +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: xorb %al, %r13b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: xorb %al, %r13b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: xorb %al, %r15b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: xorb %al, %r15b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: xorb %al, %r14b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b +; 
CHECK-SSE1-NEXT: xorb %al, %r14b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r8b +; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r8b +; CHECK-SSE1-NEXT: xorb %r8b, %r10b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: xorb %r8b, %r10b +; CHECK-SSE1-NEXT: movb %r10b, 15(%rdi) +; CHECK-SSE1-NEXT: movb %al, 14(%rdi) +; CHECK-SSE1-NEXT: movb %bl, 13(%rdi) +; CHECK-SSE1-NEXT: movb %r14b, 12(%rdi) +; CHECK-SSE1-NEXT: movb %r15b, 11(%rdi) +; CHECK-SSE1-NEXT: movb %r13b, 10(%rdi) +; CHECK-SSE1-NEXT: movb %cl, 9(%rdi) +; CHECK-SSE1-NEXT: movb %dl, 8(%rdi) +; CHECK-SSE1-NEXT: movb %sil, 7(%rdi) +; CHECK-SSE1-NEXT: movb %bpl, 6(%rdi) +; CHECK-SSE1-NEXT: movb %r12b, 5(%rdi) +; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: movb %al, 3(%rdi) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: movb %al, 2(%rdi) +; CHECK-SSE1-NEXT: movb %r11b, 1(%rdi) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: movb %al, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -982,124 +878,116 @@ ; CHECK-BASELINE-LABEL: out_v8i16: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: pushq %rbp +; CHECK-BASELINE-NEXT: pushq %r15 ; CHECK-BASELINE-NEXT: pushq %r14 +; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; 
CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r14d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: andw %r14w, %bx -; CHECK-BASELINE-NEXT: notl %r14d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r14w -; CHECK-BASELINE-NEXT: orl %ebx, %r14d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: andw %r11w, %di -; CHECK-BASELINE-NEXT: notl %r11d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-BASELINE-NEXT: orl %edi, %r11d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: andw %r10w, %di -; CHECK-BASELINE-NEXT: notl %r10d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-BASELINE-NEXT: orl %edi, %r10d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: andl %ebx, %r9d -; CHECK-BASELINE-NEXT: notl %ebx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx -; CHECK-BASELINE-NEXT: orl %r9d, %ebx -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: andl %edi, %r8d -; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp +; CHECK-BASELINE-NEXT: xorl %ebp, %esi +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: xorl %ebp, %esi +; CHECK-BASELINE-NEXT: xorl %ebx, %edx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-BASELINE-NEXT: xorl %ebx, %edx +; CHECK-BASELINE-NEXT: xorl %edi, %ecx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-BASELINE-NEXT: xorl %edi, %ecx +; CHECK-BASELINE-NEXT: xorl %r12d, %r8d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; 
CHECK-BASELINE-NEXT: xorl %r12d, %r8d +; CHECK-BASELINE-NEXT: xorl %r15d, %r9d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w +; CHECK-BASELINE-NEXT: xorl %r15d, %r9d +; CHECK-BASELINE-NEXT: movl %r14d, %edi +; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %di ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di -; CHECK-BASELINE-NEXT: orl %r8d, %edi -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: andl %ebp, %ecx -; CHECK-BASELINE-NEXT: notl %ebp +; CHECK-BASELINE-NEXT: xorl %r14d, %edi +; CHECK-BASELINE-NEXT: movl %r11d, %ebx +; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx +; CHECK-BASELINE-NEXT: xorl %r11d, %ebx +; CHECK-BASELINE-NEXT: movl %r10d, %ebp +; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bp ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bp -; CHECK-BASELINE-NEXT: orl %ecx, %ebp -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-BASELINE-NEXT: andl %ecx, %edx -; CHECK-BASELINE-NEXT: notl %ecx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: orl %edx, %ecx -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edx -; CHECK-BASELINE-NEXT: andl %edx, %esi -; CHECK-BASELINE-NEXT: notl %edx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-BASELINE-NEXT: orl %esi, %edx -; CHECK-BASELINE-NEXT: movw %r14w, 14(%rax) -; CHECK-BASELINE-NEXT: movw %r11w, 12(%rax) -; CHECK-BASELINE-NEXT: movw %r10w, 10(%rax) -; CHECK-BASELINE-NEXT: movw %bx, 8(%rax) -; CHECK-BASELINE-NEXT: movw %di, 6(%rax) -; CHECK-BASELINE-NEXT: movw %bp, 4(%rax) -; CHECK-BASELINE-NEXT: movw %cx, 2(%rax) -; CHECK-BASELINE-NEXT: movw %dx, (%rax) +; CHECK-BASELINE-NEXT: xorl %r10d, %ebp +; CHECK-BASELINE-NEXT: movw %bp, 14(%rax) +; CHECK-BASELINE-NEXT: movw %bx, 12(%rax) +; CHECK-BASELINE-NEXT: movw %di, 10(%rax) +; CHECK-BASELINE-NEXT: movw %r9w, 8(%rax) +; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) +; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) +; CHECK-BASELINE-NEXT: movw %dx, 
2(%rax) +; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r14 +; CHECK-BASELINE-NEXT: popq %r15 ; CHECK-BASELINE-NEXT: popq %rbp ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v8i16: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: pushq %rbp +; CHECK-SSE1-NEXT: pushq %r15 ; CHECK-SSE1-NEXT: pushq %r14 +; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r14d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: andw %r14w, %bx -; CHECK-SSE1-NEXT: notl %r14d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r14w -; CHECK-SSE1-NEXT: orl %ebx, %r14d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: andw %r11w, %di -; CHECK-SSE1-NEXT: notl %r11d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-SSE1-NEXT: orl %edi, %r11d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: andw %r10w, %di -; CHECK-SSE1-NEXT: notl %r10d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-SSE1-NEXT: orl %edi, %r10d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: andl %ebx, %r9d -; CHECK-SSE1-NEXT: notl %ebx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx -; CHECK-SSE1-NEXT: orl %r9d, %ebx -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: andl %edi, %r8d -; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp +; CHECK-SSE1-NEXT: xorl %ebp, %esi +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; 
CHECK-SSE1-NEXT: xorl %ebp, %esi +; CHECK-SSE1-NEXT: xorl %ebx, %edx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-SSE1-NEXT: xorl %ebx, %edx +; CHECK-SSE1-NEXT: xorl %edi, %ecx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-SSE1-NEXT: xorl %edi, %ecx +; CHECK-SSE1-NEXT: xorl %r12d, %r8d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-SSE1-NEXT: xorl %r12d, %r8d +; CHECK-SSE1-NEXT: xorl %r15d, %r9d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w +; CHECK-SSE1-NEXT: xorl %r15d, %r9d +; CHECK-SSE1-NEXT: movl %r14d, %edi +; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %di ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di -; CHECK-SSE1-NEXT: orl %r8d, %edi -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: andl %ebp, %ecx -; CHECK-SSE1-NEXT: notl %ebp +; CHECK-SSE1-NEXT: xorl %r14d, %edi +; CHECK-SSE1-NEXT: movl %r11d, %ebx +; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx +; CHECK-SSE1-NEXT: xorl %r11d, %ebx +; CHECK-SSE1-NEXT: movl %r10d, %ebp +; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bp ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bp -; CHECK-SSE1-NEXT: orl %ecx, %ebp -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-SSE1-NEXT: andl %ecx, %edx -; CHECK-SSE1-NEXT: notl %ecx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: orl %edx, %ecx -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edx -; CHECK-SSE1-NEXT: andl %edx, %esi -; CHECK-SSE1-NEXT: notl %edx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-SSE1-NEXT: orl %esi, %edx -; CHECK-SSE1-NEXT: movw %r14w, 14(%rax) -; CHECK-SSE1-NEXT: movw %r11w, 12(%rax) -; CHECK-SSE1-NEXT: movw %r10w, 10(%rax) -; CHECK-SSE1-NEXT: movw %bx, 8(%rax) -; CHECK-SSE1-NEXT: movw %di, 6(%rax) -; CHECK-SSE1-NEXT: movw %bp, 4(%rax) -; CHECK-SSE1-NEXT: movw %cx, 2(%rax) -; CHECK-SSE1-NEXT: movw %dx, (%rax) +; CHECK-SSE1-NEXT: xorl %r10d, %ebp +; CHECK-SSE1-NEXT: movw %bp, 14(%rax) +; CHECK-SSE1-NEXT: movw %bx, 12(%rax) +; 
CHECK-SSE1-NEXT: movw %di, 10(%rax) +; CHECK-SSE1-NEXT: movw %r9w, 8(%rax) +; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) +; CHECK-SSE1-NEXT: movw %cx, 4(%rax) +; CHECK-SSE1-NEXT: movw %dx, 2(%rax) +; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: popq %rbx +; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r14 +; CHECK-SSE1-NEXT: popq %r15 ; CHECK-SSE1-NEXT: popq %rbp ; CHECK-SSE1-NEXT: retq ; @@ -1124,37 +1012,31 @@ define <4 x i32> @out_v4i32(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) nounwind { ; CHECK-BASELINE-LABEL: out_v4i32: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl (%rcx), %r8d -; CHECK-BASELINE-NEXT: movl 4(%rcx), %r9d -; CHECK-BASELINE-NEXT: movl 8(%rcx), %edi -; CHECK-BASELINE-NEXT: movl 12(%rcx), %ecx -; CHECK-BASELINE-NEXT: movl 12(%rsi), %r10d -; CHECK-BASELINE-NEXT: andl %ecx, %r10d -; CHECK-BASELINE-NEXT: movl 8(%rsi), %r11d -; CHECK-BASELINE-NEXT: andl %edi, %r11d -; CHECK-BASELINE-NEXT: movl 4(%rsi), %ebx -; CHECK-BASELINE-NEXT: andl %r9d, %ebx -; CHECK-BASELINE-NEXT: movl (%rsi), %esi -; CHECK-BASELINE-NEXT: andl %r8d, %esi -; CHECK-BASELINE-NEXT: notl %r8d -; CHECK-BASELINE-NEXT: notl %r9d -; CHECK-BASELINE-NEXT: notl %edi -; CHECK-BASELINE-NEXT: notl %ecx -; CHECK-BASELINE-NEXT: andl 12(%rdx), %ecx -; CHECK-BASELINE-NEXT: orl %r10d, %ecx -; CHECK-BASELINE-NEXT: andl 8(%rdx), %edi -; CHECK-BASELINE-NEXT: orl %r11d, %edi -; CHECK-BASELINE-NEXT: andl 4(%rdx), %r9d -; CHECK-BASELINE-NEXT: orl %ebx, %r9d -; CHECK-BASELINE-NEXT: andl (%rdx), %r8d -; CHECK-BASELINE-NEXT: orl %esi, %r8d -; CHECK-BASELINE-NEXT: movl %ecx, 12(%rax) -; CHECK-BASELINE-NEXT: movl %edi, 8(%rax) -; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax) -; CHECK-BASELINE-NEXT: movl %r8d, (%rax) -; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: movl 12(%rdx), %r8d +; CHECK-BASELINE-NEXT: movl 8(%rdx), %r9d +; CHECK-BASELINE-NEXT: movl (%rdx), %edi +; CHECK-BASELINE-NEXT: movl 4(%rdx), 
%r10d +; CHECK-BASELINE-NEXT: movl (%rsi), %r11d +; CHECK-BASELINE-NEXT: xorl %edi, %r11d +; CHECK-BASELINE-NEXT: andl (%rcx), %r11d +; CHECK-BASELINE-NEXT: xorl %edi, %r11d +; CHECK-BASELINE-NEXT: movl 4(%rsi), %edi +; CHECK-BASELINE-NEXT: xorl %r10d, %edi +; CHECK-BASELINE-NEXT: andl 4(%rcx), %edi +; CHECK-BASELINE-NEXT: xorl %r10d, %edi +; CHECK-BASELINE-NEXT: movl 8(%rsi), %edx +; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: andl 8(%rcx), %edx +; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi +; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi +; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: movl %esi, 12(%rax) +; CHECK-BASELINE-NEXT: movl %edx, 8(%rax) +; CHECK-BASELINE-NEXT: movl %edi, 4(%rax) +; CHECK-BASELINE-NEXT: movl %r11d, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i32: @@ -1197,30 +1079,27 @@ ; CHECK-BASELINE-LABEL: out_v4i32_undef: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl 8(%rsi), %r8d -; CHECK-BASELINE-NEXT: movl (%rcx), %r9d -; CHECK-BASELINE-NEXT: movl 4(%rcx), %r10d -; CHECK-BASELINE-NEXT: movl 12(%rcx), %edi -; CHECK-BASELINE-NEXT: andl 8(%rcx), %r8d -; CHECK-BASELINE-NEXT: movl 12(%rsi), %ecx -; CHECK-BASELINE-NEXT: andl %edi, %ecx -; CHECK-BASELINE-NEXT: movl 4(%rsi), %r11d -; CHECK-BASELINE-NEXT: andl %r10d, %r11d -; CHECK-BASELINE-NEXT: movl (%rsi), %esi -; CHECK-BASELINE-NEXT: andl %r9d, %esi -; CHECK-BASELINE-NEXT: notl %r9d -; CHECK-BASELINE-NEXT: notl %r10d -; CHECK-BASELINE-NEXT: notl %edi -; CHECK-BASELINE-NEXT: andl 12(%rdx), %edi -; CHECK-BASELINE-NEXT: orl %ecx, %edi -; CHECK-BASELINE-NEXT: andl 4(%rdx), %r10d -; CHECK-BASELINE-NEXT: orl %r11d, %r10d -; CHECK-BASELINE-NEXT: andl (%rdx), %r9d -; CHECK-BASELINE-NEXT: orl %esi, %r9d -; CHECK-BASELINE-NEXT: movl %r8d, 8(%rax) -; CHECK-BASELINE-NEXT: movl %edi, 12(%rax) -; CHECK-BASELINE-NEXT: movl %r10d, 4(%rax) -; 
CHECK-BASELINE-NEXT: movl %r9d, (%rax) +; CHECK-BASELINE-NEXT: movl 8(%rsi), %r9d +; CHECK-BASELINE-NEXT: movl 12(%rdx), %r8d +; CHECK-BASELINE-NEXT: movl (%rdx), %edi +; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d +; CHECK-BASELINE-NEXT: andl 8(%rcx), %r9d +; CHECK-BASELINE-NEXT: movl (%rsi), %edx +; CHECK-BASELINE-NEXT: xorl %edi, %edx +; CHECK-BASELINE-NEXT: andl (%rcx), %edx +; CHECK-BASELINE-NEXT: xorl %edi, %edx +; CHECK-BASELINE-NEXT: movl 4(%rsi), %edi +; CHECK-BASELINE-NEXT: xorl %r10d, %edi +; CHECK-BASELINE-NEXT: andl 4(%rcx), %edi +; CHECK-BASELINE-NEXT: xorl %r10d, %edi +; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi +; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi +; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: movl %r9d, 8(%rax) +; CHECK-BASELINE-NEXT: movl %esi, 12(%rax) +; CHECK-BASELINE-NEXT: movl %edi, 4(%rax) +; CHECK-BASELINE-NEXT: movl %edx, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i32_undef: @@ -1262,30 +1141,26 @@ define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v2i64: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: movq %r8, %rax +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: xorq %rdx, %rax +; CHECK-BASELINE-NEXT: andq %r8, %rax +; CHECK-BASELINE-NEXT: xorq %rdx, %rax +; CHECK-BASELINE-NEXT: xorq %rcx, %rsi ; CHECK-BASELINE-NEXT: andq %r9, %rsi -; CHECK-BASELINE-NEXT: andq %r8, %rdi -; CHECK-BASELINE-NEXT: notq %rax -; CHECK-BASELINE-NEXT: notq %r9 -; CHECK-BASELINE-NEXT: andq %rcx, %r9 -; CHECK-BASELINE-NEXT: orq %rsi, %r9 -; CHECK-BASELINE-NEXT: andq %rdx, %rax -; CHECK-BASELINE-NEXT: orq %rdi, %rax -; CHECK-BASELINE-NEXT: movq %r9, %rdx +; CHECK-BASELINE-NEXT: xorq %rcx, %rsi +; CHECK-BASELINE-NEXT: movq %rsi, %rdx ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v2i64: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %r8, %rax +; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: 
xorq %rdx, %rax +; CHECK-SSE1-NEXT: andq %r8, %rax +; CHECK-SSE1-NEXT: xorq %rdx, %rax +; CHECK-SSE1-NEXT: xorq %rcx, %rsi ; CHECK-SSE1-NEXT: andq %r9, %rsi -; CHECK-SSE1-NEXT: andq %r8, %rdi -; CHECK-SSE1-NEXT: notq %rax -; CHECK-SSE1-NEXT: notq %r9 -; CHECK-SSE1-NEXT: andq %rcx, %r9 -; CHECK-SSE1-NEXT: orq %rsi, %r9 -; CHECK-SSE1-NEXT: andq %rdx, %rax -; CHECK-SSE1-NEXT: orq %rdi, %rax -; CHECK-SSE1-NEXT: movq %r9, %rdx +; CHECK-SSE1-NEXT: xorq %rcx, %rsi +; CHECK-SSE1-NEXT: movq %rsi, %rdx ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v2i64: @@ -1319,286 +1194,257 @@ ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rcx, %r15 -; CHECK-BASELINE-NEXT: movq %rsi, %r12 -; CHECK-BASELINE-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-BASELINE-NEXT: movb 16(%rcx), %al +; CHECK-BASELINE-NEXT: movq %rcx, %r10 +; CHECK-BASELINE-NEXT: movq %rdx, %r8 +; CHECK-BASELINE-NEXT: movq %rsi, %r9 +; CHECK-BASELINE-NEXT: movq %rdi, %r11 +; CHECK-BASELINE-NEXT: movb 15(%rdx), %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 17(%rcx), %al +; CHECK-BASELINE-NEXT: movb 14(%rdx), %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 18(%rcx), %al +; CHECK-BASELINE-NEXT: movb 13(%rdx), %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 19(%rcx), %al +; CHECK-BASELINE-NEXT: movb 12(%rdx), %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 20(%rcx), %al +; CHECK-BASELINE-NEXT: movb 11(%rdx), %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 21(%rcx), %r14b -; CHECK-BASELINE-NEXT: movb 22(%rcx), %r9b -; CHECK-BASELINE-NEXT: movb 23(%rcx), %r10b -; CHECK-BASELINE-NEXT: movb 24(%rcx), %r11b -; CHECK-BASELINE-NEXT: movb 
25(%rcx), %bpl -; CHECK-BASELINE-NEXT: movb 26(%rcx), %r13b -; CHECK-BASELINE-NEXT: movb 27(%rcx), %r8b -; CHECK-BASELINE-NEXT: movb 28(%rcx), %dil -; CHECK-BASELINE-NEXT: movb 29(%rcx), %sil -; CHECK-BASELINE-NEXT: movb 30(%rcx), %bl -; CHECK-BASELINE-NEXT: movb 31(%rcx), %al -; CHECK-BASELINE-NEXT: movb 31(%r12), %cl -; CHECK-BASELINE-NEXT: andb %al, %cl -; CHECK-BASELINE-NEXT: notb %al -; CHECK-BASELINE-NEXT: andb 31(%rdx), %al -; CHECK-BASELINE-NEXT: orb %cl, %al +; CHECK-BASELINE-NEXT: movb 10(%rdx), %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 30(%r12), %al -; CHECK-BASELINE-NEXT: andb %bl, %al -; CHECK-BASELINE-NEXT: notb %bl -; CHECK-BASELINE-NEXT: andb 30(%rdx), %bl -; CHECK-BASELINE-NEXT: orb %al, %bl +; CHECK-BASELINE-NEXT: movb 9(%rdx), %bpl +; CHECK-BASELINE-NEXT: movb 8(%rdx), %r14b +; CHECK-BASELINE-NEXT: movb 7(%rdx), %r15b +; CHECK-BASELINE-NEXT: movb 6(%rdx), %r12b +; CHECK-BASELINE-NEXT: movb 5(%rdx), %sil +; CHECK-BASELINE-NEXT: movb 4(%rdx), %r13b +; CHECK-BASELINE-NEXT: movb 3(%rdx), %dl +; CHECK-BASELINE-NEXT: movb 2(%r8), %dil +; CHECK-BASELINE-NEXT: movb (%r8), %al +; CHECK-BASELINE-NEXT: movb 1(%r8), %cl +; CHECK-BASELINE-NEXT: movb (%r9), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: andb (%r10), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl ; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 29(%r12), %al -; CHECK-BASELINE-NEXT: andb %sil, %al -; CHECK-BASELINE-NEXT: notb %sil -; CHECK-BASELINE-NEXT: andb 29(%rdx), %sil -; CHECK-BASELINE-NEXT: orb %al, %sil -; CHECK-BASELINE-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 28(%r12), %al -; CHECK-BASELINE-NEXT: andb %dil, %al -; CHECK-BASELINE-NEXT: notb %dil -; CHECK-BASELINE-NEXT: andb 28(%rdx), %dil -; CHECK-BASELINE-NEXT: orb %al, %dil -; CHECK-BASELINE-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; 
CHECK-BASELINE-NEXT: movb 27(%r12), %al -; CHECK-BASELINE-NEXT: andb %r8b, %al -; CHECK-BASELINE-NEXT: notb %r8b -; CHECK-BASELINE-NEXT: andb 27(%rdx), %r8b -; CHECK-BASELINE-NEXT: orb %al, %r8b -; CHECK-BASELINE-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 26(%r12), %al -; CHECK-BASELINE-NEXT: andb %r13b, %al -; CHECK-BASELINE-NEXT: notb %r13b -; CHECK-BASELINE-NEXT: andb 26(%rdx), %r13b -; CHECK-BASELINE-NEXT: orb %al, %r13b -; CHECK-BASELINE-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 25(%r12), %al -; CHECK-BASELINE-NEXT: andb %bpl, %al -; CHECK-BASELINE-NEXT: notb %bpl -; CHECK-BASELINE-NEXT: andb 25(%rdx), %bpl -; CHECK-BASELINE-NEXT: orb %al, %bpl -; CHECK-BASELINE-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 24(%r12), %al -; CHECK-BASELINE-NEXT: andb %r11b, %al -; CHECK-BASELINE-NEXT: notb %r11b -; CHECK-BASELINE-NEXT: andb 24(%rdx), %r11b -; CHECK-BASELINE-NEXT: orb %al, %r11b -; CHECK-BASELINE-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 23(%r12), %al -; CHECK-BASELINE-NEXT: andb %r10b, %al -; CHECK-BASELINE-NEXT: notb %r10b -; CHECK-BASELINE-NEXT: andb 23(%rdx), %r10b -; CHECK-BASELINE-NEXT: orb %al, %r10b -; CHECK-BASELINE-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 22(%r12), %al -; CHECK-BASELINE-NEXT: andb %r9b, %al -; CHECK-BASELINE-NEXT: notb %r9b -; CHECK-BASELINE-NEXT: andb 22(%rdx), %r9b -; CHECK-BASELINE-NEXT: orb %al, %r9b -; CHECK-BASELINE-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 21(%r12), %al -; CHECK-BASELINE-NEXT: andb %r14b, %al -; CHECK-BASELINE-NEXT: notb %r14b -; CHECK-BASELINE-NEXT: andb 21(%rdx), %r14b -; CHECK-BASELINE-NEXT: orb %al, %r14b -; CHECK-BASELINE-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 20(%r12), %al +; CHECK-BASELINE-NEXT: movb 
1(%r9), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 1(%r10), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 2(%r9), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 2(%r10), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 3(%r9), %al +; CHECK-BASELINE-NEXT: xorb %dl, %al +; CHECK-BASELINE-NEXT: andb 3(%r10), %al +; CHECK-BASELINE-NEXT: xorb %dl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 4(%r9), %al +; CHECK-BASELINE-NEXT: xorb %r13b, %al +; CHECK-BASELINE-NEXT: andb 4(%r10), %al +; CHECK-BASELINE-NEXT: xorb %r13b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 5(%r9), %al +; CHECK-BASELINE-NEXT: xorb %sil, %al +; CHECK-BASELINE-NEXT: andb 5(%r10), %al +; CHECK-BASELINE-NEXT: xorb %sil, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 6(%r9), %al +; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: andb 6(%r10), %al +; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 7(%r9), %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: andb 7(%r10), %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 8(%r9), %al +; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: andb 8(%r10), %al +; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 9(%r9), %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: andb 9(%r10), %al +; 
CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 10(%r9), %al ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-BASELINE-NEXT: andb %cl, %al -; CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: andb 20(%rdx), %cl -; CHECK-BASELINE-NEXT: orb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 19(%r12), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 10(%r10), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 11(%r9), %al ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-BASELINE-NEXT: andb %cl, %al -; CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: andb 19(%rdx), %cl -; CHECK-BASELINE-NEXT: orb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 18(%r12), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 11(%r10), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 12(%r9), %al ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-BASELINE-NEXT: andb %cl, %al -; CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: andb 18(%rdx), %cl -; CHECK-BASELINE-NEXT: orb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 17(%r12), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 12(%r10), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 13(%r9), %al ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-BASELINE-NEXT: andb %cl, %al -; 
CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: movq %rdx, %rbx -; CHECK-BASELINE-NEXT: andb 17(%rdx), %cl -; CHECK-BASELINE-NEXT: orb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 16(%r12), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 13(%r10), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 14(%r9), %al +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 14(%r10), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 15(%r9), %al ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-BASELINE-NEXT: andb %cl, %al -; CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: andb 16(%rdx), %cl -; CHECK-BASELINE-NEXT: orb %al, %cl +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 15(%r10), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 16(%r8), %al +; CHECK-BASELINE-NEXT: movb 16(%r9), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 16(%r10), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 15(%r15), %cl -; CHECK-BASELINE-NEXT: movb 15(%r12), %al -; CHECK-BASELINE-NEXT: andb %cl, %al -; CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: andb 15(%rdx), %cl -; CHECK-BASELINE-NEXT: orb %al, %cl +; CHECK-BASELINE-NEXT: movb 17(%r8), %al +; CHECK-BASELINE-NEXT: movb 17(%r9), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 17(%r10), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 14(%r15), %cl -; CHECK-BASELINE-NEXT: movb 14(%r12), %al -; CHECK-BASELINE-NEXT: andb %cl, %al -; CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: andb 14(%rdx), %cl -; CHECK-BASELINE-NEXT: orb %al, %cl +; CHECK-BASELINE-NEXT: movb 18(%r8), %al +; CHECK-BASELINE-NEXT: movb 18(%r9), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 18(%r10), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 13(%r15), %cl -; CHECK-BASELINE-NEXT: movb 13(%r12), %al -; CHECK-BASELINE-NEXT: andb %cl, %al -; CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: andb 13(%rdx), %cl -; CHECK-BASELINE-NEXT: orb %al, %cl +; CHECK-BASELINE-NEXT: movb 19(%r8), %al +; CHECK-BASELINE-NEXT: movb 19(%r9), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 19(%r10), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 12(%r15), %cl -; CHECK-BASELINE-NEXT: movb 12(%r12), %al -; CHECK-BASELINE-NEXT: andb %cl, %al -; CHECK-BASELINE-NEXT: notb %cl -; CHECK-BASELINE-NEXT: andb 12(%rdx), %cl -; CHECK-BASELINE-NEXT: orb %al, %cl +; CHECK-BASELINE-NEXT: movb 20(%r8), %al +; CHECK-BASELINE-NEXT: movb 20(%r9), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 20(%r10), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 11(%r15), %r13b -; CHECK-BASELINE-NEXT: movb 11(%r12), %al -; CHECK-BASELINE-NEXT: andb %r13b, %al -; CHECK-BASELINE-NEXT: notb %r13b -; CHECK-BASELINE-NEXT: andb 11(%rdx), %r13b -; CHECK-BASELINE-NEXT: orb %al, %r13b -; CHECK-BASELINE-NEXT: movb 10(%r15), %r14b -; CHECK-BASELINE-NEXT: movb 10(%r12), %al -; CHECK-BASELINE-NEXT: andb %r14b, %al -; CHECK-BASELINE-NEXT: notb %r14b -; 
CHECK-BASELINE-NEXT: andb 10(%rdx), %r14b -; CHECK-BASELINE-NEXT: orb %al, %r14b -; CHECK-BASELINE-NEXT: movb 9(%r15), %bpl -; CHECK-BASELINE-NEXT: movb 9(%r12), %al -; CHECK-BASELINE-NEXT: andb %bpl, %al -; CHECK-BASELINE-NEXT: notb %bpl -; CHECK-BASELINE-NEXT: andb 9(%rdx), %bpl -; CHECK-BASELINE-NEXT: orb %al, %bpl -; CHECK-BASELINE-NEXT: movb 8(%r15), %r11b -; CHECK-BASELINE-NEXT: movb 8(%r12), %al -; CHECK-BASELINE-NEXT: andb %r11b, %al -; CHECK-BASELINE-NEXT: notb %r11b -; CHECK-BASELINE-NEXT: andb 8(%rdx), %r11b -; CHECK-BASELINE-NEXT: orb %al, %r11b -; CHECK-BASELINE-NEXT: movb 7(%r15), %r10b -; CHECK-BASELINE-NEXT: movb 7(%r12), %al -; CHECK-BASELINE-NEXT: andb %r10b, %al -; CHECK-BASELINE-NEXT: notb %r10b -; CHECK-BASELINE-NEXT: andb 7(%rdx), %r10b -; CHECK-BASELINE-NEXT: orb %al, %r10b -; CHECK-BASELINE-NEXT: movb 6(%r15), %r9b -; CHECK-BASELINE-NEXT: movb 6(%r12), %al -; CHECK-BASELINE-NEXT: andb %r9b, %al -; CHECK-BASELINE-NEXT: notb %r9b -; CHECK-BASELINE-NEXT: andb 6(%rdx), %r9b -; CHECK-BASELINE-NEXT: orb %al, %r9b -; CHECK-BASELINE-NEXT: movb 5(%r15), %r8b -; CHECK-BASELINE-NEXT: movb 5(%r12), %al -; CHECK-BASELINE-NEXT: andb %r8b, %al -; CHECK-BASELINE-NEXT: notb %r8b -; CHECK-BASELINE-NEXT: andb 5(%rdx), %r8b -; CHECK-BASELINE-NEXT: orb %al, %r8b -; CHECK-BASELINE-NEXT: movb 4(%r15), %dil -; CHECK-BASELINE-NEXT: movb 4(%r12), %al -; CHECK-BASELINE-NEXT: andb %dil, %al -; CHECK-BASELINE-NEXT: notb %dil -; CHECK-BASELINE-NEXT: andb 4(%rdx), %dil -; CHECK-BASELINE-NEXT: orb %al, %dil -; CHECK-BASELINE-NEXT: movb 3(%r15), %sil -; CHECK-BASELINE-NEXT: movb 3(%r12), %al -; CHECK-BASELINE-NEXT: andb %sil, %al -; CHECK-BASELINE-NEXT: notb %sil -; CHECK-BASELINE-NEXT: andb 3(%rdx), %sil -; CHECK-BASELINE-NEXT: orb %al, %sil -; CHECK-BASELINE-NEXT: movb 2(%r15), %dl -; CHECK-BASELINE-NEXT: movb 2(%r12), %al -; CHECK-BASELINE-NEXT: andb %dl, %al -; CHECK-BASELINE-NEXT: notb %dl -; CHECK-BASELINE-NEXT: andb 2(%rbx), %dl -; CHECK-BASELINE-NEXT: orb %al, %dl 
-; CHECK-BASELINE-NEXT: movb 1(%r15), %al -; CHECK-BASELINE-NEXT: movb 1(%r12), %cl -; CHECK-BASELINE-NEXT: andb %al, %cl -; CHECK-BASELINE-NEXT: notb %al -; CHECK-BASELINE-NEXT: andb 1(%rbx), %al -; CHECK-BASELINE-NEXT: orb %cl, %al -; CHECK-BASELINE-NEXT: movb (%r15), %r15b -; CHECK-BASELINE-NEXT: movb (%r12), %cl -; CHECK-BASELINE-NEXT: andb %r15b, %cl -; CHECK-BASELINE-NEXT: notb %r15b -; CHECK-BASELINE-NEXT: andb (%rbx), %r15b -; CHECK-BASELINE-NEXT: orb %cl, %r15b -; CHECK-BASELINE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 31(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 30(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 29(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 28(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 27(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 26(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 25(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 24(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 23(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 22(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 21(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 20(%rcx) -; CHECK-BASELINE-NEXT: movb 
{{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 19(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 18(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 17(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 16(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 15(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 14(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 13(%rcx) -; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-BASELINE-NEXT: movb %bl, 12(%rcx) -; CHECK-BASELINE-NEXT: movb %r13b, 11(%rcx) -; CHECK-BASELINE-NEXT: movb %r14b, 10(%rcx) -; CHECK-BASELINE-NEXT: movb %bpl, 9(%rcx) -; CHECK-BASELINE-NEXT: movb %r11b, 8(%rcx) -; CHECK-BASELINE-NEXT: movb %r10b, 7(%rcx) -; CHECK-BASELINE-NEXT: movb %r9b, 6(%rcx) -; CHECK-BASELINE-NEXT: movb %r8b, 5(%rcx) -; CHECK-BASELINE-NEXT: movb %dil, 4(%rcx) -; CHECK-BASELINE-NEXT: movb %sil, 3(%rcx) -; CHECK-BASELINE-NEXT: movb %dl, 2(%rcx) -; CHECK-BASELINE-NEXT: movb %al, 1(%rcx) -; CHECK-BASELINE-NEXT: movb %r15b, (%rcx) -; CHECK-BASELINE-NEXT: movq %rcx, %rax +; CHECK-BASELINE-NEXT: movb 21(%r8), %al +; CHECK-BASELINE-NEXT: movb 21(%r9), %r13b +; CHECK-BASELINE-NEXT: xorb %al, %r13b +; CHECK-BASELINE-NEXT: andb 21(%r10), %r13b +; CHECK-BASELINE-NEXT: xorb %al, %r13b +; CHECK-BASELINE-NEXT: movb 22(%r8), %al +; CHECK-BASELINE-NEXT: movb 22(%r9), %r12b +; CHECK-BASELINE-NEXT: xorb %al, %r12b +; CHECK-BASELINE-NEXT: andb 22(%r10), %r12b +; CHECK-BASELINE-NEXT: xorb %al, %r12b +; CHECK-BASELINE-NEXT: movb 23(%r8), %al +; CHECK-BASELINE-NEXT: movb 23(%r9), %r15b +; 
CHECK-BASELINE-NEXT: xorb %al, %r15b +; CHECK-BASELINE-NEXT: andb 23(%r10), %r15b +; CHECK-BASELINE-NEXT: xorb %al, %r15b +; CHECK-BASELINE-NEXT: movb 24(%r8), %al +; CHECK-BASELINE-NEXT: movb 24(%r9), %r14b +; CHECK-BASELINE-NEXT: xorb %al, %r14b +; CHECK-BASELINE-NEXT: andb 24(%r10), %r14b +; CHECK-BASELINE-NEXT: xorb %al, %r14b +; CHECK-BASELINE-NEXT: movb 25(%r8), %al +; CHECK-BASELINE-NEXT: movb 25(%r9), %bpl +; CHECK-BASELINE-NEXT: xorb %al, %bpl +; CHECK-BASELINE-NEXT: andb 25(%r10), %bpl +; CHECK-BASELINE-NEXT: xorb %al, %bpl +; CHECK-BASELINE-NEXT: movb 26(%r8), %al +; CHECK-BASELINE-NEXT: movb 26(%r9), %dil +; CHECK-BASELINE-NEXT: xorb %al, %dil +; CHECK-BASELINE-NEXT: andb 26(%r10), %dil +; CHECK-BASELINE-NEXT: xorb %al, %dil +; CHECK-BASELINE-NEXT: movb 27(%r8), %al +; CHECK-BASELINE-NEXT: movb 27(%r9), %sil +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: andb 27(%r10), %sil +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: movb 28(%r8), %al +; CHECK-BASELINE-NEXT: movb 28(%r9), %dl +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: andb 28(%r10), %dl +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: movb 29(%r8), %al +; CHECK-BASELINE-NEXT: movb 29(%r9), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 29(%r10), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb 30(%r8), %bl +; CHECK-BASELINE-NEXT: movb 30(%r9), %al +; CHECK-BASELINE-NEXT: xorb %bl, %al +; CHECK-BASELINE-NEXT: andb 30(%r10), %al +; CHECK-BASELINE-NEXT: xorb %bl, %al +; CHECK-BASELINE-NEXT: movb 31(%r8), %r8b +; CHECK-BASELINE-NEXT: movb 31(%r9), %bl +; CHECK-BASELINE-NEXT: xorb %r8b, %bl +; CHECK-BASELINE-NEXT: andb 31(%r10), %bl +; CHECK-BASELINE-NEXT: xorb %r8b, %bl +; CHECK-BASELINE-NEXT: movb %bl, 31(%r11) +; CHECK-BASELINE-NEXT: movb %al, 30(%r11) +; CHECK-BASELINE-NEXT: movb %cl, 29(%r11) +; CHECK-BASELINE-NEXT: movb %dl, 28(%r11) +; CHECK-BASELINE-NEXT: movb %sil, 27(%r11) +; 
CHECK-BASELINE-NEXT: movb %dil, 26(%r11) +; CHECK-BASELINE-NEXT: movb %bpl, 25(%r11) +; CHECK-BASELINE-NEXT: movb %r14b, 24(%r11) +; CHECK-BASELINE-NEXT: movb %r15b, 23(%r11) +; CHECK-BASELINE-NEXT: movb %r12b, 22(%r11) +; CHECK-BASELINE-NEXT: movb %r13b, 21(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 20(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 19(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 18(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 17(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 16(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 15(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 14(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 13(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 12(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 11(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 10(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 9(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 8(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 7(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; 
CHECK-BASELINE-NEXT: movb %al, 6(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 5(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 4(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 3(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 2(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 1(%r11) +; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-BASELINE-NEXT: movb %al, (%r11) +; CHECK-BASELINE-NEXT: movq %r11, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -1615,286 +1461,257 @@ ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rcx, %r15 -; CHECK-SSE1-NEXT: movq %rsi, %r12 -; CHECK-SSE1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-SSE1-NEXT: movb 16(%rcx), %al +; CHECK-SSE1-NEXT: movq %rcx, %r10 +; CHECK-SSE1-NEXT: movq %rdx, %r8 +; CHECK-SSE1-NEXT: movq %rsi, %r9 +; CHECK-SSE1-NEXT: movq %rdi, %r11 +; CHECK-SSE1-NEXT: movb 15(%rdx), %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 17(%rcx), %al +; CHECK-SSE1-NEXT: movb 14(%rdx), %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 18(%rcx), %al +; CHECK-SSE1-NEXT: movb 13(%rdx), %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 19(%rcx), %al +; CHECK-SSE1-NEXT: movb 12(%rdx), %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 20(%rcx), %al +; CHECK-SSE1-NEXT: movb 11(%rdx), %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) 
# 1-byte Spill -; CHECK-SSE1-NEXT: movb 21(%rcx), %r14b -; CHECK-SSE1-NEXT: movb 22(%rcx), %r9b -; CHECK-SSE1-NEXT: movb 23(%rcx), %r10b -; CHECK-SSE1-NEXT: movb 24(%rcx), %r11b -; CHECK-SSE1-NEXT: movb 25(%rcx), %bpl -; CHECK-SSE1-NEXT: movb 26(%rcx), %r13b -; CHECK-SSE1-NEXT: movb 27(%rcx), %r8b -; CHECK-SSE1-NEXT: movb 28(%rcx), %dil -; CHECK-SSE1-NEXT: movb 29(%rcx), %sil -; CHECK-SSE1-NEXT: movb 30(%rcx), %bl -; CHECK-SSE1-NEXT: movb 31(%rcx), %al -; CHECK-SSE1-NEXT: movb 31(%r12), %cl -; CHECK-SSE1-NEXT: andb %al, %cl -; CHECK-SSE1-NEXT: notb %al -; CHECK-SSE1-NEXT: andb 31(%rdx), %al -; CHECK-SSE1-NEXT: orb %cl, %al +; CHECK-SSE1-NEXT: movb 10(%rdx), %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 30(%r12), %al -; CHECK-SSE1-NEXT: andb %bl, %al -; CHECK-SSE1-NEXT: notb %bl -; CHECK-SSE1-NEXT: andb 30(%rdx), %bl -; CHECK-SSE1-NEXT: orb %al, %bl +; CHECK-SSE1-NEXT: movb 9(%rdx), %bpl +; CHECK-SSE1-NEXT: movb 8(%rdx), %r14b +; CHECK-SSE1-NEXT: movb 7(%rdx), %r15b +; CHECK-SSE1-NEXT: movb 6(%rdx), %r12b +; CHECK-SSE1-NEXT: movb 5(%rdx), %sil +; CHECK-SSE1-NEXT: movb 4(%rdx), %r13b +; CHECK-SSE1-NEXT: movb 3(%rdx), %dl +; CHECK-SSE1-NEXT: movb 2(%r8), %dil +; CHECK-SSE1-NEXT: movb (%r8), %al +; CHECK-SSE1-NEXT: movb 1(%r8), %cl +; CHECK-SSE1-NEXT: movb (%r9), %bl +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: andb (%r10), %bl +; CHECK-SSE1-NEXT: xorb %al, %bl ; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 29(%r12), %al -; CHECK-SSE1-NEXT: andb %sil, %al -; CHECK-SSE1-NEXT: notb %sil -; CHECK-SSE1-NEXT: andb 29(%rdx), %sil -; CHECK-SSE1-NEXT: orb %al, %sil -; CHECK-SSE1-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 28(%r12), %al -; CHECK-SSE1-NEXT: andb %dil, %al -; CHECK-SSE1-NEXT: notb %dil -; CHECK-SSE1-NEXT: andb 28(%rdx), %dil -; CHECK-SSE1-NEXT: orb %al, %dil -; CHECK-SSE1-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 
1-byte Spill -; CHECK-SSE1-NEXT: movb 27(%r12), %al -; CHECK-SSE1-NEXT: andb %r8b, %al -; CHECK-SSE1-NEXT: notb %r8b -; CHECK-SSE1-NEXT: andb 27(%rdx), %r8b -; CHECK-SSE1-NEXT: orb %al, %r8b -; CHECK-SSE1-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 26(%r12), %al -; CHECK-SSE1-NEXT: andb %r13b, %al -; CHECK-SSE1-NEXT: notb %r13b -; CHECK-SSE1-NEXT: andb 26(%rdx), %r13b -; CHECK-SSE1-NEXT: orb %al, %r13b -; CHECK-SSE1-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 25(%r12), %al -; CHECK-SSE1-NEXT: andb %bpl, %al -; CHECK-SSE1-NEXT: notb %bpl -; CHECK-SSE1-NEXT: andb 25(%rdx), %bpl -; CHECK-SSE1-NEXT: orb %al, %bpl -; CHECK-SSE1-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 24(%r12), %al -; CHECK-SSE1-NEXT: andb %r11b, %al -; CHECK-SSE1-NEXT: notb %r11b -; CHECK-SSE1-NEXT: andb 24(%rdx), %r11b -; CHECK-SSE1-NEXT: orb %al, %r11b -; CHECK-SSE1-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 23(%r12), %al -; CHECK-SSE1-NEXT: andb %r10b, %al -; CHECK-SSE1-NEXT: notb %r10b -; CHECK-SSE1-NEXT: andb 23(%rdx), %r10b -; CHECK-SSE1-NEXT: orb %al, %r10b -; CHECK-SSE1-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 22(%r12), %al -; CHECK-SSE1-NEXT: andb %r9b, %al -; CHECK-SSE1-NEXT: notb %r9b -; CHECK-SSE1-NEXT: andb 22(%rdx), %r9b -; CHECK-SSE1-NEXT: orb %al, %r9b -; CHECK-SSE1-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 21(%r12), %al -; CHECK-SSE1-NEXT: andb %r14b, %al -; CHECK-SSE1-NEXT: notb %r14b -; CHECK-SSE1-NEXT: andb 21(%rdx), %r14b -; CHECK-SSE1-NEXT: orb %al, %r14b -; CHECK-SSE1-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 20(%r12), %al +; CHECK-SSE1-NEXT: movb 1(%r9), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 1(%r10), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 2(%r9), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: andb 2(%r10), %al +; CHECK-SSE1-NEXT: xorb %dil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 3(%r9), %al +; CHECK-SSE1-NEXT: xorb %dl, %al +; CHECK-SSE1-NEXT: andb 3(%r10), %al +; CHECK-SSE1-NEXT: xorb %dl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 4(%r9), %al +; CHECK-SSE1-NEXT: xorb %r13b, %al +; CHECK-SSE1-NEXT: andb 4(%r10), %al +; CHECK-SSE1-NEXT: xorb %r13b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 5(%r9), %al +; CHECK-SSE1-NEXT: xorb %sil, %al +; CHECK-SSE1-NEXT: andb 5(%r10), %al +; CHECK-SSE1-NEXT: xorb %sil, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 6(%r9), %al +; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: andb 6(%r10), %al +; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 7(%r9), %al +; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: andb 7(%r10), %al +; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 8(%r9), %al +; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: andb 8(%r10), %al +; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 9(%r9), %al +; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: andb 9(%r10), %al +; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 10(%r9), %al ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-SSE1-NEXT: andb %cl, %al -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: andb 20(%rdx), %cl -; CHECK-SSE1-NEXT: orb 
%al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 19(%r12), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 10(%r10), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 11(%r9), %al ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-SSE1-NEXT: andb %cl, %al -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: andb 19(%rdx), %cl -; CHECK-SSE1-NEXT: orb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 18(%r12), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 11(%r10), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 12(%r9), %al ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-SSE1-NEXT: andb %cl, %al -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: andb 18(%rdx), %cl -; CHECK-SSE1-NEXT: orb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 17(%r12), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 12(%r10), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 13(%r9), %al ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-SSE1-NEXT: andb %cl, %al -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: movq %rdx, %rbx -; CHECK-SSE1-NEXT: andb 17(%rdx), %cl -; CHECK-SSE1-NEXT: orb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 16(%r12), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 13(%r10), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 14(%r9), %al +; CHECK-SSE1-NEXT: movb 
{{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 14(%r10), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 15(%r9), %al ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-SSE1-NEXT: andb %cl, %al -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: andb 16(%rdx), %cl -; CHECK-SSE1-NEXT: orb %al, %cl +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 15(%r10), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 16(%r8), %al +; CHECK-SSE1-NEXT: movb 16(%r9), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 16(%r10), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 15(%r15), %cl -; CHECK-SSE1-NEXT: movb 15(%r12), %al -; CHECK-SSE1-NEXT: andb %cl, %al -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: andb 15(%rdx), %cl -; CHECK-SSE1-NEXT: orb %al, %cl +; CHECK-SSE1-NEXT: movb 17(%r8), %al +; CHECK-SSE1-NEXT: movb 17(%r9), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 17(%r10), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 14(%r15), %cl -; CHECK-SSE1-NEXT: movb 14(%r12), %al -; CHECK-SSE1-NEXT: andb %cl, %al -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: andb 14(%rdx), %cl -; CHECK-SSE1-NEXT: orb %al, %cl +; CHECK-SSE1-NEXT: movb 18(%r8), %al +; CHECK-SSE1-NEXT: movb 18(%r9), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 18(%r10), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 13(%r15), %cl -; CHECK-SSE1-NEXT: movb 13(%r12), %al -; CHECK-SSE1-NEXT: andb %cl, %al -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: andb 
13(%rdx), %cl -; CHECK-SSE1-NEXT: orb %al, %cl +; CHECK-SSE1-NEXT: movb 19(%r8), %al +; CHECK-SSE1-NEXT: movb 19(%r9), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 19(%r10), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 12(%r15), %cl -; CHECK-SSE1-NEXT: movb 12(%r12), %al -; CHECK-SSE1-NEXT: andb %cl, %al -; CHECK-SSE1-NEXT: notb %cl -; CHECK-SSE1-NEXT: andb 12(%rdx), %cl -; CHECK-SSE1-NEXT: orb %al, %cl +; CHECK-SSE1-NEXT: movb 20(%r8), %al +; CHECK-SSE1-NEXT: movb 20(%r9), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 20(%r10), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 11(%r15), %r13b -; CHECK-SSE1-NEXT: movb 11(%r12), %al -; CHECK-SSE1-NEXT: andb %r13b, %al -; CHECK-SSE1-NEXT: notb %r13b -; CHECK-SSE1-NEXT: andb 11(%rdx), %r13b -; CHECK-SSE1-NEXT: orb %al, %r13b -; CHECK-SSE1-NEXT: movb 10(%r15), %r14b -; CHECK-SSE1-NEXT: movb 10(%r12), %al -; CHECK-SSE1-NEXT: andb %r14b, %al -; CHECK-SSE1-NEXT: notb %r14b -; CHECK-SSE1-NEXT: andb 10(%rdx), %r14b -; CHECK-SSE1-NEXT: orb %al, %r14b -; CHECK-SSE1-NEXT: movb 9(%r15), %bpl -; CHECK-SSE1-NEXT: movb 9(%r12), %al -; CHECK-SSE1-NEXT: andb %bpl, %al -; CHECK-SSE1-NEXT: notb %bpl -; CHECK-SSE1-NEXT: andb 9(%rdx), %bpl -; CHECK-SSE1-NEXT: orb %al, %bpl -; CHECK-SSE1-NEXT: movb 8(%r15), %r11b -; CHECK-SSE1-NEXT: movb 8(%r12), %al -; CHECK-SSE1-NEXT: andb %r11b, %al -; CHECK-SSE1-NEXT: notb %r11b -; CHECK-SSE1-NEXT: andb 8(%rdx), %r11b -; CHECK-SSE1-NEXT: orb %al, %r11b -; CHECK-SSE1-NEXT: movb 7(%r15), %r10b -; CHECK-SSE1-NEXT: movb 7(%r12), %al -; CHECK-SSE1-NEXT: andb %r10b, %al -; CHECK-SSE1-NEXT: notb %r10b -; CHECK-SSE1-NEXT: andb 7(%rdx), %r10b -; CHECK-SSE1-NEXT: orb %al, %r10b -; CHECK-SSE1-NEXT: movb 6(%r15), %r9b -; CHECK-SSE1-NEXT: movb 6(%r12), %al -; CHECK-SSE1-NEXT: andb %r9b, %al -; CHECK-SSE1-NEXT: 
notb %r9b -; CHECK-SSE1-NEXT: andb 6(%rdx), %r9b -; CHECK-SSE1-NEXT: orb %al, %r9b -; CHECK-SSE1-NEXT: movb 5(%r15), %r8b -; CHECK-SSE1-NEXT: movb 5(%r12), %al -; CHECK-SSE1-NEXT: andb %r8b, %al -; CHECK-SSE1-NEXT: notb %r8b -; CHECK-SSE1-NEXT: andb 5(%rdx), %r8b -; CHECK-SSE1-NEXT: orb %al, %r8b -; CHECK-SSE1-NEXT: movb 4(%r15), %dil -; CHECK-SSE1-NEXT: movb 4(%r12), %al -; CHECK-SSE1-NEXT: andb %dil, %al -; CHECK-SSE1-NEXT: notb %dil -; CHECK-SSE1-NEXT: andb 4(%rdx), %dil -; CHECK-SSE1-NEXT: orb %al, %dil -; CHECK-SSE1-NEXT: movb 3(%r15), %sil -; CHECK-SSE1-NEXT: movb 3(%r12), %al -; CHECK-SSE1-NEXT: andb %sil, %al -; CHECK-SSE1-NEXT: notb %sil -; CHECK-SSE1-NEXT: andb 3(%rdx), %sil -; CHECK-SSE1-NEXT: orb %al, %sil -; CHECK-SSE1-NEXT: movb 2(%r15), %dl -; CHECK-SSE1-NEXT: movb 2(%r12), %al -; CHECK-SSE1-NEXT: andb %dl, %al -; CHECK-SSE1-NEXT: notb %dl -; CHECK-SSE1-NEXT: andb 2(%rbx), %dl -; CHECK-SSE1-NEXT: orb %al, %dl -; CHECK-SSE1-NEXT: movb 1(%r15), %al -; CHECK-SSE1-NEXT: movb 1(%r12), %cl -; CHECK-SSE1-NEXT: andb %al, %cl -; CHECK-SSE1-NEXT: notb %al -; CHECK-SSE1-NEXT: andb 1(%rbx), %al -; CHECK-SSE1-NEXT: orb %cl, %al -; CHECK-SSE1-NEXT: movb (%r15), %r15b -; CHECK-SSE1-NEXT: movb (%r12), %cl -; CHECK-SSE1-NEXT: andb %r15b, %cl -; CHECK-SSE1-NEXT: notb %r15b -; CHECK-SSE1-NEXT: andb (%rbx), %r15b -; CHECK-SSE1-NEXT: orb %cl, %r15b -; CHECK-SSE1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 31(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 30(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 29(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 28(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 27(%rcx) -; 
CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 26(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 25(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 24(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 23(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 22(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 21(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 20(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 19(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 18(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 17(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 16(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 15(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 14(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 13(%rcx) -; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload -; CHECK-SSE1-NEXT: movb %bl, 12(%rcx) -; CHECK-SSE1-NEXT: movb %r13b, 11(%rcx) -; CHECK-SSE1-NEXT: movb %r14b, 10(%rcx) -; CHECK-SSE1-NEXT: movb %bpl, 9(%rcx) -; CHECK-SSE1-NEXT: movb %r11b, 8(%rcx) -; CHECK-SSE1-NEXT: movb %r10b, 7(%rcx) -; CHECK-SSE1-NEXT: movb %r9b, 6(%rcx) -; CHECK-SSE1-NEXT: movb %r8b, 5(%rcx) -; CHECK-SSE1-NEXT: movb %dil, 4(%rcx) -; CHECK-SSE1-NEXT: movb %sil, 
3(%rcx) -; CHECK-SSE1-NEXT: movb %dl, 2(%rcx) -; CHECK-SSE1-NEXT: movb %al, 1(%rcx) -; CHECK-SSE1-NEXT: movb %r15b, (%rcx) -; CHECK-SSE1-NEXT: movq %rcx, %rax +; CHECK-SSE1-NEXT: movb 21(%r8), %al +; CHECK-SSE1-NEXT: movb 21(%r9), %r13b +; CHECK-SSE1-NEXT: xorb %al, %r13b +; CHECK-SSE1-NEXT: andb 21(%r10), %r13b +; CHECK-SSE1-NEXT: xorb %al, %r13b +; CHECK-SSE1-NEXT: movb 22(%r8), %al +; CHECK-SSE1-NEXT: movb 22(%r9), %r12b +; CHECK-SSE1-NEXT: xorb %al, %r12b +; CHECK-SSE1-NEXT: andb 22(%r10), %r12b +; CHECK-SSE1-NEXT: xorb %al, %r12b +; CHECK-SSE1-NEXT: movb 23(%r8), %al +; CHECK-SSE1-NEXT: movb 23(%r9), %r15b +; CHECK-SSE1-NEXT: xorb %al, %r15b +; CHECK-SSE1-NEXT: andb 23(%r10), %r15b +; CHECK-SSE1-NEXT: xorb %al, %r15b +; CHECK-SSE1-NEXT: movb 24(%r8), %al +; CHECK-SSE1-NEXT: movb 24(%r9), %r14b +; CHECK-SSE1-NEXT: xorb %al, %r14b +; CHECK-SSE1-NEXT: andb 24(%r10), %r14b +; CHECK-SSE1-NEXT: xorb %al, %r14b +; CHECK-SSE1-NEXT: movb 25(%r8), %al +; CHECK-SSE1-NEXT: movb 25(%r9), %bpl +; CHECK-SSE1-NEXT: xorb %al, %bpl +; CHECK-SSE1-NEXT: andb 25(%r10), %bpl +; CHECK-SSE1-NEXT: xorb %al, %bpl +; CHECK-SSE1-NEXT: movb 26(%r8), %al +; CHECK-SSE1-NEXT: movb 26(%r9), %dil +; CHECK-SSE1-NEXT: xorb %al, %dil +; CHECK-SSE1-NEXT: andb 26(%r10), %dil +; CHECK-SSE1-NEXT: xorb %al, %dil +; CHECK-SSE1-NEXT: movb 27(%r8), %al +; CHECK-SSE1-NEXT: movb 27(%r9), %sil +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: andb 27(%r10), %sil +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: movb 28(%r8), %al +; CHECK-SSE1-NEXT: movb 28(%r9), %dl +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: andb 28(%r10), %dl +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: movb 29(%r8), %al +; CHECK-SSE1-NEXT: movb 29(%r9), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 29(%r10), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb 30(%r8), %bl +; CHECK-SSE1-NEXT: movb 30(%r9), %al +; CHECK-SSE1-NEXT: xorb %bl, %al +; CHECK-SSE1-NEXT: andb 30(%r10), %al 
+; CHECK-SSE1-NEXT: xorb %bl, %al +; CHECK-SSE1-NEXT: movb 31(%r8), %r8b +; CHECK-SSE1-NEXT: movb 31(%r9), %bl +; CHECK-SSE1-NEXT: xorb %r8b, %bl +; CHECK-SSE1-NEXT: andb 31(%r10), %bl +; CHECK-SSE1-NEXT: xorb %r8b, %bl +; CHECK-SSE1-NEXT: movb %bl, 31(%r11) +; CHECK-SSE1-NEXT: movb %al, 30(%r11) +; CHECK-SSE1-NEXT: movb %cl, 29(%r11) +; CHECK-SSE1-NEXT: movb %dl, 28(%r11) +; CHECK-SSE1-NEXT: movb %sil, 27(%r11) +; CHECK-SSE1-NEXT: movb %dil, 26(%r11) +; CHECK-SSE1-NEXT: movb %bpl, 25(%r11) +; CHECK-SSE1-NEXT: movb %r14b, 24(%r11) +; CHECK-SSE1-NEXT: movb %r15b, 23(%r11) +; CHECK-SSE1-NEXT: movb %r12b, 22(%r11) +; CHECK-SSE1-NEXT: movb %r13b, 21(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 20(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 19(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 18(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 17(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 16(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 15(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 14(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 13(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 12(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 11(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 10(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 9(%r11) +; CHECK-SSE1-NEXT: 
movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 8(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 7(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 6(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 5(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 4(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 3(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 2(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, 1(%r11) +; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-SSE1-NEXT: movb %al, (%r11) +; CHECK-SSE1-NEXT: movq %r11, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -1942,135 +1759,113 @@ ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rcx, %r9 -; CHECK-BASELINE-NEXT: movq %rdx, %r10 -; CHECK-BASELINE-NEXT: movq %rsi, %r8 -; CHECK-BASELINE-NEXT: movq %rdi, %r11 -; CHECK-BASELINE-NEXT: movl 12(%rcx), %eax -; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 14(%rcx), %edx -; CHECK-BASELINE-NEXT: movl 16(%rcx), %esi -; CHECK-BASELINE-NEXT: movzwl 18(%rcx), %edi -; CHECK-BASELINE-NEXT: movl 20(%rcx), %ecx -; CHECK-BASELINE-NEXT: movzwl 22(%r9), %r15d -; CHECK-BASELINE-NEXT: movl 24(%r9), %r12d -; CHECK-BASELINE-NEXT: movzwl 26(%r9), %r14d -; CHECK-BASELINE-NEXT: movl 28(%r9), %ebx -; CHECK-BASELINE-NEXT: movzwl 30(%r9), %ebp -; CHECK-BASELINE-NEXT: movzwl 30(%r8), %r13d -; CHECK-BASELINE-NEXT: andw %bp, %r13w -; 
CHECK-BASELINE-NEXT: notl %ebp -; CHECK-BASELINE-NEXT: andw 30(%r10), %bp -; CHECK-BASELINE-NEXT: orl %r13d, %ebp -; CHECK-BASELINE-NEXT: movzwl 28(%r8), %eax -; CHECK-BASELINE-NEXT: andw %bx, %ax -; CHECK-BASELINE-NEXT: notl %ebx -; CHECK-BASELINE-NEXT: andw 28(%r10), %bx -; CHECK-BASELINE-NEXT: orl %eax, %ebx -; CHECK-BASELINE-NEXT: movzwl 26(%r8), %eax -; CHECK-BASELINE-NEXT: andw %r14w, %ax -; CHECK-BASELINE-NEXT: notl %r14d -; CHECK-BASELINE-NEXT: andw 26(%r10), %r14w -; CHECK-BASELINE-NEXT: orl %eax, %r14d -; CHECK-BASELINE-NEXT: movzwl 24(%r8), %eax -; CHECK-BASELINE-NEXT: andw %r12w, %ax -; CHECK-BASELINE-NEXT: notl %r12d -; CHECK-BASELINE-NEXT: andw 24(%r10), %r12w -; CHECK-BASELINE-NEXT: orl %eax, %r12d -; CHECK-BASELINE-NEXT: movzwl 22(%r8), %eax -; CHECK-BASELINE-NEXT: andw %r15w, %ax -; CHECK-BASELINE-NEXT: notl %r15d -; CHECK-BASELINE-NEXT: andw 22(%r10), %r15w -; CHECK-BASELINE-NEXT: orl %eax, %r15d -; CHECK-BASELINE-NEXT: movzwl 20(%r8), %eax -; CHECK-BASELINE-NEXT: andw %cx, %ax -; CHECK-BASELINE-NEXT: notl %ecx -; CHECK-BASELINE-NEXT: andw 20(%r10), %cx -; CHECK-BASELINE-NEXT: orl %eax, %ecx -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 18(%r8), %eax -; CHECK-BASELINE-NEXT: andw %di, %ax -; CHECK-BASELINE-NEXT: notl %edi -; CHECK-BASELINE-NEXT: andw 18(%r10), %di -; CHECK-BASELINE-NEXT: orl %eax, %edi -; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 16(%r8), %eax -; CHECK-BASELINE-NEXT: andw %si, %ax -; CHECK-BASELINE-NEXT: notl %esi -; CHECK-BASELINE-NEXT: andw 16(%r10), %si -; CHECK-BASELINE-NEXT: orl %eax, %esi -; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 14(%r8), %eax -; CHECK-BASELINE-NEXT: andw %dx, %ax -; CHECK-BASELINE-NEXT: notl %edx -; CHECK-BASELINE-NEXT: andw 14(%r10), %dx -; CHECK-BASELINE-NEXT: orl %eax, %edx -; CHECK-BASELINE-NEXT: movl %edx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 12(%r8), %eax -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: andw %cx, %ax -; CHECK-BASELINE-NEXT: notl %ecx -; CHECK-BASELINE-NEXT: andw 12(%r10), %cx -; CHECK-BASELINE-NEXT: orl %eax, %ecx -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 10(%r9), %r13d -; CHECK-BASELINE-NEXT: movzwl 10(%r8), %eax -; CHECK-BASELINE-NEXT: andw %r13w, %ax -; CHECK-BASELINE-NEXT: notl %r13d -; CHECK-BASELINE-NEXT: andw 10(%r10), %r13w -; CHECK-BASELINE-NEXT: orl %eax, %r13d -; CHECK-BASELINE-NEXT: movl 8(%r9), %edi -; CHECK-BASELINE-NEXT: movzwl 8(%r8), %eax -; CHECK-BASELINE-NEXT: andw %di, %ax -; CHECK-BASELINE-NEXT: notl %edi -; CHECK-BASELINE-NEXT: andw 8(%r10), %di -; CHECK-BASELINE-NEXT: orl %eax, %edi -; CHECK-BASELINE-NEXT: movzwl 6(%r9), %esi -; CHECK-BASELINE-NEXT: movzwl 6(%r8), %eax -; CHECK-BASELINE-NEXT: andw %si, %ax -; CHECK-BASELINE-NEXT: notl %esi -; CHECK-BASELINE-NEXT: andw 6(%r10), %si -; CHECK-BASELINE-NEXT: orl %eax, %esi -; CHECK-BASELINE-NEXT: movl 4(%r9), %edx -; CHECK-BASELINE-NEXT: movzwl 4(%r8), %eax -; CHECK-BASELINE-NEXT: andw %dx, %ax -; CHECK-BASELINE-NEXT: notl %edx -; CHECK-BASELINE-NEXT: andw 4(%r10), %dx -; CHECK-BASELINE-NEXT: orl %eax, %edx -; CHECK-BASELINE-NEXT: movzwl 2(%r9), %eax -; CHECK-BASELINE-NEXT: movzwl 2(%r8), %ecx -; CHECK-BASELINE-NEXT: andw %ax, %cx -; CHECK-BASELINE-NEXT: notl %eax -; CHECK-BASELINE-NEXT: andw 2(%r10), %ax -; CHECK-BASELINE-NEXT: orl %ecx, %eax -; CHECK-BASELINE-NEXT: movl (%r9), %r9d -; CHECK-BASELINE-NEXT: movzwl (%r8), %ecx -; CHECK-BASELINE-NEXT: andw %r9w, %cx -; CHECK-BASELINE-NEXT: notl %r9d -; CHECK-BASELINE-NEXT: andw (%r10), %r9w -; CHECK-BASELINE-NEXT: orl %ecx, %r9d -; CHECK-BASELINE-NEXT: movw %bp, 30(%r11) -; CHECK-BASELINE-NEXT: movw %bx, 28(%r11) -; CHECK-BASELINE-NEXT: movw %r14w, 26(%r11) -; CHECK-BASELINE-NEXT: movw 
%r12w, 24(%r11) -; CHECK-BASELINE-NEXT: movw %r15w, 22(%r11) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %cx, 20(%r11) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %cx, 18(%r11) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %cx, 16(%r11) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %cx, 14(%r11) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %cx, 12(%r11) -; CHECK-BASELINE-NEXT: movw %r13w, 10(%r11) -; CHECK-BASELINE-NEXT: movw %di, 8(%r11) -; CHECK-BASELINE-NEXT: movw %si, 6(%r11) -; CHECK-BASELINE-NEXT: movw %dx, 4(%r11) -; CHECK-BASELINE-NEXT: movw %ax, 2(%r11) -; CHECK-BASELINE-NEXT: movw %r9w, (%r11) -; CHECK-BASELINE-NEXT: movq %r11, %rax +; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r15d +; CHECK-BASELINE-NEXT: movzwl 16(%rdx), %r14d +; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %r11d +; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r9d +; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r8d +; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %ebx +; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r12d +; CHECK-BASELINE-NEXT: movzwl (%rdx), %ebp +; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzwl (%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %bp, %ax +; CHECK-BASELINE-NEXT: andw (%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %ebp +; CHECK-BASELINE-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r10w, %ax +; CHECK-BASELINE-NEXT: andw 2(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r10d +; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r12w, %ax +; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r12d +; 
CHECK-BASELINE-NEXT: movzwl 6(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %bx, %ax +; CHECK-BASELINE-NEXT: andw 6(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %ebx +; CHECK-BASELINE-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r8w, %ax +; CHECK-BASELINE-NEXT: andw 8(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r8d +; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r9w, %ax +; CHECK-BASELINE-NEXT: andw 10(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r9d +; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r13w, %ax +; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r13d +; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r11w, %ax +; CHECK-BASELINE-NEXT: andw 14(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r11d +; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r14w, %ax +; CHECK-BASELINE-NEXT: andw 16(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r14d +; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r15w, %ax +; CHECK-BASELINE-NEXT: andw 18(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r15d +; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %ebx +; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %bx, %ax +; CHECK-BASELINE-NEXT: andw 20(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %ebx +; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %r9d +; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r9w, %ax +; CHECK-BASELINE-NEXT: andw 22(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r9d +; CHECK-BASELINE-NEXT: movzwl 24(%rdx), %r8d +; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %eax +; 
CHECK-BASELINE-NEXT: xorw %r8w, %ax +; CHECK-BASELINE-NEXT: andw 24(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r8d +; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax +; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r11d +; CHECK-BASELINE-NEXT: xorw %ax, %r11w +; CHECK-BASELINE-NEXT: andw 26(%rcx), %r11w +; CHECK-BASELINE-NEXT: xorl %r11d, %eax +; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r11d +; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %ebp +; CHECK-BASELINE-NEXT: xorw %r11w, %bp +; CHECK-BASELINE-NEXT: andw 28(%rcx), %bp +; CHECK-BASELINE-NEXT: xorl %ebp, %r11d +; CHECK-BASELINE-NEXT: movzwl 30(%rdx), %edx +; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %esi +; CHECK-BASELINE-NEXT: xorw %dx, %si +; CHECK-BASELINE-NEXT: andw 30(%rcx), %si +; CHECK-BASELINE-NEXT: xorl %esi, %edx +; CHECK-BASELINE-NEXT: movw %dx, 30(%rdi) +; CHECK-BASELINE-NEXT: movw %r11w, 28(%rdi) +; CHECK-BASELINE-NEXT: movw %ax, 26(%rdi) +; CHECK-BASELINE-NEXT: movw %r8w, 24(%rdi) +; CHECK-BASELINE-NEXT: movw %r9w, 22(%rdi) +; CHECK-BASELINE-NEXT: movw %bx, 20(%rdi) +; CHECK-BASELINE-NEXT: movw %r15w, 18(%rdi) +; CHECK-BASELINE-NEXT: movw %r14w, 16(%rdi) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %ax, 14(%rdi) +; CHECK-BASELINE-NEXT: movw %r13w, 12(%rdi) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %ax, 10(%rdi) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %ax, 8(%rdi) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi) +; CHECK-BASELINE-NEXT: movw %r12w, 4(%rdi) +; CHECK-BASELINE-NEXT: movw %r10w, 2(%rdi) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %ax, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -2087,135 
+1882,113 @@ ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rcx, %r9 -; CHECK-SSE1-NEXT: movq %rdx, %r10 -; CHECK-SSE1-NEXT: movq %rsi, %r8 -; CHECK-SSE1-NEXT: movq %rdi, %r11 -; CHECK-SSE1-NEXT: movl 12(%rcx), %eax -; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 14(%rcx), %edx -; CHECK-SSE1-NEXT: movl 16(%rcx), %esi -; CHECK-SSE1-NEXT: movzwl 18(%rcx), %edi -; CHECK-SSE1-NEXT: movl 20(%rcx), %ecx -; CHECK-SSE1-NEXT: movzwl 22(%r9), %r15d -; CHECK-SSE1-NEXT: movl 24(%r9), %r12d -; CHECK-SSE1-NEXT: movzwl 26(%r9), %r14d -; CHECK-SSE1-NEXT: movl 28(%r9), %ebx -; CHECK-SSE1-NEXT: movzwl 30(%r9), %ebp -; CHECK-SSE1-NEXT: movzwl 30(%r8), %r13d -; CHECK-SSE1-NEXT: andw %bp, %r13w -; CHECK-SSE1-NEXT: notl %ebp -; CHECK-SSE1-NEXT: andw 30(%r10), %bp -; CHECK-SSE1-NEXT: orl %r13d, %ebp -; CHECK-SSE1-NEXT: movzwl 28(%r8), %eax -; CHECK-SSE1-NEXT: andw %bx, %ax -; CHECK-SSE1-NEXT: notl %ebx -; CHECK-SSE1-NEXT: andw 28(%r10), %bx -; CHECK-SSE1-NEXT: orl %eax, %ebx -; CHECK-SSE1-NEXT: movzwl 26(%r8), %eax -; CHECK-SSE1-NEXT: andw %r14w, %ax -; CHECK-SSE1-NEXT: notl %r14d -; CHECK-SSE1-NEXT: andw 26(%r10), %r14w -; CHECK-SSE1-NEXT: orl %eax, %r14d -; CHECK-SSE1-NEXT: movzwl 24(%r8), %eax -; CHECK-SSE1-NEXT: andw %r12w, %ax -; CHECK-SSE1-NEXT: notl %r12d -; CHECK-SSE1-NEXT: andw 24(%r10), %r12w -; CHECK-SSE1-NEXT: orl %eax, %r12d -; CHECK-SSE1-NEXT: movzwl 22(%r8), %eax -; CHECK-SSE1-NEXT: andw %r15w, %ax -; CHECK-SSE1-NEXT: notl %r15d -; CHECK-SSE1-NEXT: andw 22(%r10), %r15w -; CHECK-SSE1-NEXT: orl %eax, %r15d -; CHECK-SSE1-NEXT: movzwl 20(%r8), %eax -; CHECK-SSE1-NEXT: andw %cx, %ax -; CHECK-SSE1-NEXT: notl %ecx -; CHECK-SSE1-NEXT: andw 20(%r10), %cx -; CHECK-SSE1-NEXT: orl %eax, %ecx -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 18(%r8), %eax -; CHECK-SSE1-NEXT: andw %di, %ax -; CHECK-SSE1-NEXT: notl %edi -; 
CHECK-SSE1-NEXT: andw 18(%r10), %di -; CHECK-SSE1-NEXT: orl %eax, %edi -; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 16(%r8), %eax -; CHECK-SSE1-NEXT: andw %si, %ax -; CHECK-SSE1-NEXT: notl %esi -; CHECK-SSE1-NEXT: andw 16(%r10), %si -; CHECK-SSE1-NEXT: orl %eax, %esi -; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 14(%r8), %eax -; CHECK-SSE1-NEXT: andw %dx, %ax -; CHECK-SSE1-NEXT: notl %edx -; CHECK-SSE1-NEXT: andw 14(%r10), %dx -; CHECK-SSE1-NEXT: orl %eax, %edx -; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 12(%r8), %eax -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: andw %cx, %ax -; CHECK-SSE1-NEXT: notl %ecx -; CHECK-SSE1-NEXT: andw 12(%r10), %cx -; CHECK-SSE1-NEXT: orl %eax, %ecx -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 10(%r9), %r13d -; CHECK-SSE1-NEXT: movzwl 10(%r8), %eax -; CHECK-SSE1-NEXT: andw %r13w, %ax -; CHECK-SSE1-NEXT: notl %r13d -; CHECK-SSE1-NEXT: andw 10(%r10), %r13w -; CHECK-SSE1-NEXT: orl %eax, %r13d -; CHECK-SSE1-NEXT: movl 8(%r9), %edi -; CHECK-SSE1-NEXT: movzwl 8(%r8), %eax -; CHECK-SSE1-NEXT: andw %di, %ax -; CHECK-SSE1-NEXT: notl %edi -; CHECK-SSE1-NEXT: andw 8(%r10), %di -; CHECK-SSE1-NEXT: orl %eax, %edi -; CHECK-SSE1-NEXT: movzwl 6(%r9), %esi -; CHECK-SSE1-NEXT: movzwl 6(%r8), %eax -; CHECK-SSE1-NEXT: andw %si, %ax -; CHECK-SSE1-NEXT: notl %esi -; CHECK-SSE1-NEXT: andw 6(%r10), %si -; CHECK-SSE1-NEXT: orl %eax, %esi -; CHECK-SSE1-NEXT: movl 4(%r9), %edx -; CHECK-SSE1-NEXT: movzwl 4(%r8), %eax -; CHECK-SSE1-NEXT: andw %dx, %ax -; CHECK-SSE1-NEXT: notl %edx -; CHECK-SSE1-NEXT: andw 4(%r10), %dx -; CHECK-SSE1-NEXT: orl %eax, %edx -; CHECK-SSE1-NEXT: movzwl 2(%r9), %eax -; CHECK-SSE1-NEXT: movzwl 2(%r8), %ecx -; CHECK-SSE1-NEXT: andw %ax, %cx -; CHECK-SSE1-NEXT: notl %eax -; 
CHECK-SSE1-NEXT: andw 2(%r10), %ax -; CHECK-SSE1-NEXT: orl %ecx, %eax -; CHECK-SSE1-NEXT: movl (%r9), %r9d -; CHECK-SSE1-NEXT: movzwl (%r8), %ecx -; CHECK-SSE1-NEXT: andw %r9w, %cx -; CHECK-SSE1-NEXT: notl %r9d -; CHECK-SSE1-NEXT: andw (%r10), %r9w -; CHECK-SSE1-NEXT: orl %ecx, %r9d -; CHECK-SSE1-NEXT: movw %bp, 30(%r11) -; CHECK-SSE1-NEXT: movw %bx, 28(%r11) -; CHECK-SSE1-NEXT: movw %r14w, 26(%r11) -; CHECK-SSE1-NEXT: movw %r12w, 24(%r11) -; CHECK-SSE1-NEXT: movw %r15w, 22(%r11) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: movw %cx, 20(%r11) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: movw %cx, 18(%r11) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: movw %cx, 16(%r11) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: movw %cx, 14(%r11) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: movw %cx, 12(%r11) -; CHECK-SSE1-NEXT: movw %r13w, 10(%r11) -; CHECK-SSE1-NEXT: movw %di, 8(%r11) -; CHECK-SSE1-NEXT: movw %si, 6(%r11) -; CHECK-SSE1-NEXT: movw %dx, 4(%r11) -; CHECK-SSE1-NEXT: movw %ax, 2(%r11) -; CHECK-SSE1-NEXT: movw %r9w, (%r11) -; CHECK-SSE1-NEXT: movq %r11, %rax +; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r15d +; CHECK-SSE1-NEXT: movzwl 16(%rdx), %r14d +; CHECK-SSE1-NEXT: movzwl 14(%rdx), %r11d +; CHECK-SSE1-NEXT: movzwl 12(%rdx), %r13d +; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r9d +; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r8d +; CHECK-SSE1-NEXT: movzwl 6(%rdx), %ebx +; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r12d +; CHECK-SSE1-NEXT: movzwl (%rdx), %ebp +; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r10d +; CHECK-SSE1-NEXT: movzwl (%rsi), %eax +; CHECK-SSE1-NEXT: xorw %bp, %ax +; CHECK-SSE1-NEXT: andw (%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %ebp +; CHECK-SSE1-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax +; 
CHECK-SSE1-NEXT: xorw %r10w, %ax +; CHECK-SSE1-NEXT: andw 2(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r10d +; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r12w, %ax +; CHECK-SSE1-NEXT: andw 4(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r12d +; CHECK-SSE1-NEXT: movzwl 6(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %bx, %ax +; CHECK-SSE1-NEXT: andw 6(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %ebx +; CHECK-SSE1-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r8w, %ax +; CHECK-SSE1-NEXT: andw 8(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r8d +; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r9w, %ax +; CHECK-SSE1-NEXT: andw 10(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r9d +; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r13w, %ax +; CHECK-SSE1-NEXT: andw 12(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r13d +; CHECK-SSE1-NEXT: movzwl 14(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r11w, %ax +; CHECK-SSE1-NEXT: andw 14(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r11d +; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 16(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r14w, %ax +; CHECK-SSE1-NEXT: andw 16(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r14d +; CHECK-SSE1-NEXT: movzwl 18(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r15w, %ax +; CHECK-SSE1-NEXT: andw 18(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r15d +; CHECK-SSE1-NEXT: movzwl 20(%rdx), %ebx +; CHECK-SSE1-NEXT: movzwl 20(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %bx, %ax +; CHECK-SSE1-NEXT: andw 20(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %ebx +; CHECK-SSE1-NEXT: movzwl 22(%rdx), %r9d +; CHECK-SSE1-NEXT: movzwl 22(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r9w, %ax +; CHECK-SSE1-NEXT: andw 22(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r9d 
+; CHECK-SSE1-NEXT: movzwl 24(%rdx), %r8d +; CHECK-SSE1-NEXT: movzwl 24(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r8w, %ax +; CHECK-SSE1-NEXT: andw 24(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r8d +; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax +; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r11d +; CHECK-SSE1-NEXT: xorw %ax, %r11w +; CHECK-SSE1-NEXT: andw 26(%rcx), %r11w +; CHECK-SSE1-NEXT: xorl %r11d, %eax +; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r11d +; CHECK-SSE1-NEXT: movzwl 28(%rsi), %ebp +; CHECK-SSE1-NEXT: xorw %r11w, %bp +; CHECK-SSE1-NEXT: andw 28(%rcx), %bp +; CHECK-SSE1-NEXT: xorl %ebp, %r11d +; CHECK-SSE1-NEXT: movzwl 30(%rdx), %edx +; CHECK-SSE1-NEXT: movzwl 30(%rsi), %esi +; CHECK-SSE1-NEXT: xorw %dx, %si +; CHECK-SSE1-NEXT: andw 30(%rcx), %si +; CHECK-SSE1-NEXT: xorl %esi, %edx +; CHECK-SSE1-NEXT: movw %dx, 30(%rdi) +; CHECK-SSE1-NEXT: movw %r11w, 28(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 26(%rdi) +; CHECK-SSE1-NEXT: movw %r8w, 24(%rdi) +; CHECK-SSE1-NEXT: movw %r9w, 22(%rdi) +; CHECK-SSE1-NEXT: movw %bx, 20(%rdi) +; CHECK-SSE1-NEXT: movw %r15w, 18(%rdi) +; CHECK-SSE1-NEXT: movw %r14w, 16(%rdi) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: movw %ax, 14(%rdi) +; CHECK-SSE1-NEXT: movw %r13w, 12(%rdi) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: movw %ax, 10(%rdi) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: movw %ax, 8(%rdi) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: movw %ax, 6(%rdi) +; CHECK-SSE1-NEXT: movw %r12w, 4(%rdi) +; CHECK-SSE1-NEXT: movw %r10w, 2(%rdi) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: movw %ax, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -2260,65 +2033,59 @@ ; CHECK-BASELINE-NEXT: pushq %rbp ; CHECK-BASELINE-NEXT: pushq %r15 ; 
CHECK-BASELINE-NEXT: pushq %r14 +; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl 4(%rcx), %r8d -; CHECK-BASELINE-NEXT: movl 8(%rcx), %r9d -; CHECK-BASELINE-NEXT: movl 12(%rcx), %r10d -; CHECK-BASELINE-NEXT: movl 16(%rcx), %r11d -; CHECK-BASELINE-NEXT: movl 20(%rcx), %r15d -; CHECK-BASELINE-NEXT: movl 24(%rcx), %ebx -; CHECK-BASELINE-NEXT: movl 28(%rcx), %ebp -; CHECK-BASELINE-NEXT: movl 28(%rsi), %r14d -; CHECK-BASELINE-NEXT: andl %ebp, %r14d -; CHECK-BASELINE-NEXT: notl %ebp -; CHECK-BASELINE-NEXT: andl 28(%rdx), %ebp -; CHECK-BASELINE-NEXT: orl %r14d, %ebp -; CHECK-BASELINE-NEXT: movl 24(%rsi), %edi -; CHECK-BASELINE-NEXT: andl %ebx, %edi -; CHECK-BASELINE-NEXT: notl %ebx -; CHECK-BASELINE-NEXT: andl 24(%rdx), %ebx -; CHECK-BASELINE-NEXT: orl %edi, %ebx +; CHECK-BASELINE-NEXT: movl 28(%rdx), %r8d +; CHECK-BASELINE-NEXT: movl 24(%rdx), %r9d +; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d +; CHECK-BASELINE-NEXT: movl 16(%rdx), %r14d +; CHECK-BASELINE-NEXT: movl 12(%rdx), %ebx +; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebp +; CHECK-BASELINE-NEXT: movl (%rdx), %edi +; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx +; CHECK-BASELINE-NEXT: movl (%rsi), %r11d +; CHECK-BASELINE-NEXT: xorl %edi, %r11d +; CHECK-BASELINE-NEXT: andl (%rcx), %r11d +; CHECK-BASELINE-NEXT: xorl %edi, %r11d +; CHECK-BASELINE-NEXT: movl 4(%rsi), %r15d +; CHECK-BASELINE-NEXT: xorl %edx, %r15d +; CHECK-BASELINE-NEXT: andl 4(%rcx), %r15d +; CHECK-BASELINE-NEXT: xorl %edx, %r15d +; CHECK-BASELINE-NEXT: movl 8(%rsi), %r12d +; CHECK-BASELINE-NEXT: xorl %ebp, %r12d +; CHECK-BASELINE-NEXT: andl 8(%rcx), %r12d +; CHECK-BASELINE-NEXT: xorl %ebp, %r12d +; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebp +; CHECK-BASELINE-NEXT: xorl %ebx, %ebp +; CHECK-BASELINE-NEXT: andl 12(%rcx), %ebp +; CHECK-BASELINE-NEXT: xorl %ebx, %ebp +; CHECK-BASELINE-NEXT: movl 16(%rsi), %ebx +; CHECK-BASELINE-NEXT: xorl %r14d, %ebx +; CHECK-BASELINE-NEXT: andl 
16(%rcx), %ebx +; CHECK-BASELINE-NEXT: xorl %r14d, %ebx ; CHECK-BASELINE-NEXT: movl 20(%rsi), %edi -; CHECK-BASELINE-NEXT: andl %r15d, %edi -; CHECK-BASELINE-NEXT: notl %r15d -; CHECK-BASELINE-NEXT: andl 20(%rdx), %r15d -; CHECK-BASELINE-NEXT: orl %edi, %r15d -; CHECK-BASELINE-NEXT: movl 16(%rsi), %edi -; CHECK-BASELINE-NEXT: andl %r11d, %edi -; CHECK-BASELINE-NEXT: notl %r11d -; CHECK-BASELINE-NEXT: andl 16(%rdx), %r11d -; CHECK-BASELINE-NEXT: orl %edi, %r11d -; CHECK-BASELINE-NEXT: movl 12(%rsi), %edi -; CHECK-BASELINE-NEXT: andl %r10d, %edi -; CHECK-BASELINE-NEXT: notl %r10d -; CHECK-BASELINE-NEXT: andl 12(%rdx), %r10d -; CHECK-BASELINE-NEXT: orl %edi, %r10d -; CHECK-BASELINE-NEXT: movl 8(%rsi), %edi -; CHECK-BASELINE-NEXT: andl %r9d, %edi -; CHECK-BASELINE-NEXT: notl %r9d -; CHECK-BASELINE-NEXT: andl 8(%rdx), %r9d -; CHECK-BASELINE-NEXT: orl %edi, %r9d -; CHECK-BASELINE-NEXT: movl 4(%rsi), %edi -; CHECK-BASELINE-NEXT: andl %r8d, %edi -; CHECK-BASELINE-NEXT: notl %r8d -; CHECK-BASELINE-NEXT: andl 4(%rdx), %r8d -; CHECK-BASELINE-NEXT: orl %edi, %r8d -; CHECK-BASELINE-NEXT: movl (%rcx), %ecx -; CHECK-BASELINE-NEXT: movl (%rsi), %esi -; CHECK-BASELINE-NEXT: andl %ecx, %esi -; CHECK-BASELINE-NEXT: notl %ecx -; CHECK-BASELINE-NEXT: andl (%rdx), %ecx -; CHECK-BASELINE-NEXT: orl %esi, %ecx -; CHECK-BASELINE-NEXT: movl %ebp, 28(%rax) -; CHECK-BASELINE-NEXT: movl %ebx, 24(%rax) -; CHECK-BASELINE-NEXT: movl %r15d, 20(%rax) -; CHECK-BASELINE-NEXT: movl %r11d, 16(%rax) -; CHECK-BASELINE-NEXT: movl %r10d, 12(%rax) -; CHECK-BASELINE-NEXT: movl %r9d, 8(%rax) -; CHECK-BASELINE-NEXT: movl %r8d, 4(%rax) -; CHECK-BASELINE-NEXT: movl %ecx, (%rax) +; CHECK-BASELINE-NEXT: xorl %r10d, %edi +; CHECK-BASELINE-NEXT: andl 20(%rcx), %edi +; CHECK-BASELINE-NEXT: xorl %r10d, %edi +; CHECK-BASELINE-NEXT: movl 24(%rsi), %edx +; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: andl 24(%rcx), %edx +; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: movl 28(%rsi), %esi +; 
CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: andl 28(%rcx), %esi +; CHECK-BASELINE-NEXT: xorl %r8d, %esi +; CHECK-BASELINE-NEXT: movl %esi, 28(%rax) +; CHECK-BASELINE-NEXT: movl %edx, 24(%rax) +; CHECK-BASELINE-NEXT: movl %edi, 20(%rax) +; CHECK-BASELINE-NEXT: movl %ebx, 16(%rax) +; CHECK-BASELINE-NEXT: movl %ebp, 12(%rax) +; CHECK-BASELINE-NEXT: movl %r12d, 8(%rax) +; CHECK-BASELINE-NEXT: movl %r15d, 4(%rax) +; CHECK-BASELINE-NEXT: movl %r11d, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r14 ; CHECK-BASELINE-NEXT: popq %r15 ; CHECK-BASELINE-NEXT: popq %rbp @@ -2329,65 +2096,59 @@ ; CHECK-SSE1-NEXT: pushq %rbp ; CHECK-SSE1-NEXT: pushq %r15 ; CHECK-SSE1-NEXT: pushq %r14 +; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movl 4(%rcx), %r8d -; CHECK-SSE1-NEXT: movl 8(%rcx), %r9d -; CHECK-SSE1-NEXT: movl 12(%rcx), %r10d -; CHECK-SSE1-NEXT: movl 16(%rcx), %r11d -; CHECK-SSE1-NEXT: movl 20(%rcx), %r15d -; CHECK-SSE1-NEXT: movl 24(%rcx), %ebx -; CHECK-SSE1-NEXT: movl 28(%rcx), %ebp -; CHECK-SSE1-NEXT: movl 28(%rsi), %r14d -; CHECK-SSE1-NEXT: andl %ebp, %r14d -; CHECK-SSE1-NEXT: notl %ebp -; CHECK-SSE1-NEXT: andl 28(%rdx), %ebp -; CHECK-SSE1-NEXT: orl %r14d, %ebp -; CHECK-SSE1-NEXT: movl 24(%rsi), %edi -; CHECK-SSE1-NEXT: andl %ebx, %edi -; CHECK-SSE1-NEXT: notl %ebx -; CHECK-SSE1-NEXT: andl 24(%rdx), %ebx -; CHECK-SSE1-NEXT: orl %edi, %ebx +; CHECK-SSE1-NEXT: movl 28(%rdx), %r8d +; CHECK-SSE1-NEXT: movl 24(%rdx), %r9d +; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d +; CHECK-SSE1-NEXT: movl 16(%rdx), %r14d +; CHECK-SSE1-NEXT: movl 12(%rdx), %ebx +; CHECK-SSE1-NEXT: movl 8(%rdx), %ebp +; CHECK-SSE1-NEXT: movl (%rdx), %edi +; CHECK-SSE1-NEXT: movl 4(%rdx), %edx +; CHECK-SSE1-NEXT: movl (%rsi), %r11d +; CHECK-SSE1-NEXT: xorl %edi, %r11d +; CHECK-SSE1-NEXT: andl (%rcx), %r11d +; CHECK-SSE1-NEXT: xorl %edi, %r11d +; CHECK-SSE1-NEXT: movl 4(%rsi), 
%r15d +; CHECK-SSE1-NEXT: xorl %edx, %r15d +; CHECK-SSE1-NEXT: andl 4(%rcx), %r15d +; CHECK-SSE1-NEXT: xorl %edx, %r15d +; CHECK-SSE1-NEXT: movl 8(%rsi), %r12d +; CHECK-SSE1-NEXT: xorl %ebp, %r12d +; CHECK-SSE1-NEXT: andl 8(%rcx), %r12d +; CHECK-SSE1-NEXT: xorl %ebp, %r12d +; CHECK-SSE1-NEXT: movl 12(%rsi), %ebp +; CHECK-SSE1-NEXT: xorl %ebx, %ebp +; CHECK-SSE1-NEXT: andl 12(%rcx), %ebp +; CHECK-SSE1-NEXT: xorl %ebx, %ebp +; CHECK-SSE1-NEXT: movl 16(%rsi), %ebx +; CHECK-SSE1-NEXT: xorl %r14d, %ebx +; CHECK-SSE1-NEXT: andl 16(%rcx), %ebx +; CHECK-SSE1-NEXT: xorl %r14d, %ebx ; CHECK-SSE1-NEXT: movl 20(%rsi), %edi -; CHECK-SSE1-NEXT: andl %r15d, %edi -; CHECK-SSE1-NEXT: notl %r15d -; CHECK-SSE1-NEXT: andl 20(%rdx), %r15d -; CHECK-SSE1-NEXT: orl %edi, %r15d -; CHECK-SSE1-NEXT: movl 16(%rsi), %edi -; CHECK-SSE1-NEXT: andl %r11d, %edi -; CHECK-SSE1-NEXT: notl %r11d -; CHECK-SSE1-NEXT: andl 16(%rdx), %r11d -; CHECK-SSE1-NEXT: orl %edi, %r11d -; CHECK-SSE1-NEXT: movl 12(%rsi), %edi -; CHECK-SSE1-NEXT: andl %r10d, %edi -; CHECK-SSE1-NEXT: notl %r10d -; CHECK-SSE1-NEXT: andl 12(%rdx), %r10d -; CHECK-SSE1-NEXT: orl %edi, %r10d -; CHECK-SSE1-NEXT: movl 8(%rsi), %edi -; CHECK-SSE1-NEXT: andl %r9d, %edi -; CHECK-SSE1-NEXT: notl %r9d -; CHECK-SSE1-NEXT: andl 8(%rdx), %r9d -; CHECK-SSE1-NEXT: orl %edi, %r9d -; CHECK-SSE1-NEXT: movl 4(%rsi), %edi -; CHECK-SSE1-NEXT: andl %r8d, %edi -; CHECK-SSE1-NEXT: notl %r8d -; CHECK-SSE1-NEXT: andl 4(%rdx), %r8d -; CHECK-SSE1-NEXT: orl %edi, %r8d -; CHECK-SSE1-NEXT: movl (%rcx), %ecx -; CHECK-SSE1-NEXT: movl (%rsi), %esi -; CHECK-SSE1-NEXT: andl %ecx, %esi -; CHECK-SSE1-NEXT: notl %ecx -; CHECK-SSE1-NEXT: andl (%rdx), %ecx -; CHECK-SSE1-NEXT: orl %esi, %ecx -; CHECK-SSE1-NEXT: movl %ebp, 28(%rax) -; CHECK-SSE1-NEXT: movl %ebx, 24(%rax) -; CHECK-SSE1-NEXT: movl %r15d, 20(%rax) -; CHECK-SSE1-NEXT: movl %r11d, 16(%rax) -; CHECK-SSE1-NEXT: movl %r10d, 12(%rax) -; CHECK-SSE1-NEXT: movl %r9d, 8(%rax) -; CHECK-SSE1-NEXT: movl %r8d, 4(%rax) -; 
CHECK-SSE1-NEXT: movl %ecx, (%rax) +; CHECK-SSE1-NEXT: xorl %r10d, %edi +; CHECK-SSE1-NEXT: andl 20(%rcx), %edi +; CHECK-SSE1-NEXT: xorl %r10d, %edi +; CHECK-SSE1-NEXT: movl 24(%rsi), %edx +; CHECK-SSE1-NEXT: xorl %r9d, %edx +; CHECK-SSE1-NEXT: andl 24(%rcx), %edx +; CHECK-SSE1-NEXT: xorl %r9d, %edx +; CHECK-SSE1-NEXT: movl 28(%rsi), %esi +; CHECK-SSE1-NEXT: xorl %r8d, %esi +; CHECK-SSE1-NEXT: andl 28(%rcx), %esi +; CHECK-SSE1-NEXT: xorl %r8d, %esi +; CHECK-SSE1-NEXT: movl %esi, 28(%rax) +; CHECK-SSE1-NEXT: movl %edx, 24(%rax) +; CHECK-SSE1-NEXT: movl %edi, 20(%rax) +; CHECK-SSE1-NEXT: movl %ebx, 16(%rax) +; CHECK-SSE1-NEXT: movl %ebp, 12(%rax) +; CHECK-SSE1-NEXT: movl %r12d, 8(%rax) +; CHECK-SSE1-NEXT: movl %r15d, 4(%rax) +; CHECK-SSE1-NEXT: movl %r11d, (%rax) ; CHECK-SSE1-NEXT: popq %rbx +; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r14 ; CHECK-SSE1-NEXT: popq %r15 ; CHECK-SSE1-NEXT: popq %rbp @@ -2426,72 +2187,60 @@ define <4 x i64> @out_v4i64(<4 x i64> *%px, <4 x i64> *%py, <4 x i64> *%pmask) nounwind { ; CHECK-BASELINE-LABEL: out_v4i64: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movq (%rcx), %r8 -; CHECK-BASELINE-NEXT: movq 8(%rcx), %r9 -; CHECK-BASELINE-NEXT: movq 16(%rcx), %rdi -; CHECK-BASELINE-NEXT: movq 24(%rcx), %rcx -; CHECK-BASELINE-NEXT: movq 24(%rsi), %r10 -; CHECK-BASELINE-NEXT: andq %rcx, %r10 -; CHECK-BASELINE-NEXT: movq 16(%rsi), %r11 -; CHECK-BASELINE-NEXT: andq %rdi, %r11 -; CHECK-BASELINE-NEXT: movq 8(%rsi), %rbx -; CHECK-BASELINE-NEXT: andq %r9, %rbx -; CHECK-BASELINE-NEXT: movq (%rsi), %rsi -; CHECK-BASELINE-NEXT: andq %r8, %rsi -; CHECK-BASELINE-NEXT: notq %r8 -; CHECK-BASELINE-NEXT: notq %r9 -; CHECK-BASELINE-NEXT: notq %rdi -; CHECK-BASELINE-NEXT: notq %rcx -; CHECK-BASELINE-NEXT: andq 24(%rdx), %rcx -; CHECK-BASELINE-NEXT: orq %r10, %rcx -; CHECK-BASELINE-NEXT: andq 16(%rdx), %rdi -; CHECK-BASELINE-NEXT: orq %r11, %rdi -; CHECK-BASELINE-NEXT: andq 
8(%rdx), %r9 -; CHECK-BASELINE-NEXT: orq %rbx, %r9 -; CHECK-BASELINE-NEXT: andq (%rdx), %r8 -; CHECK-BASELINE-NEXT: orq %rsi, %r8 -; CHECK-BASELINE-NEXT: movq %rcx, 24(%rax) -; CHECK-BASELINE-NEXT: movq %rdi, 16(%rax) -; CHECK-BASELINE-NEXT: movq %r9, 8(%rax) -; CHECK-BASELINE-NEXT: movq %r8, (%rax) -; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: movq 24(%rdx), %r8 +; CHECK-BASELINE-NEXT: movq 16(%rdx), %r9 +; CHECK-BASELINE-NEXT: movq (%rdx), %rdi +; CHECK-BASELINE-NEXT: movq 8(%rdx), %r10 +; CHECK-BASELINE-NEXT: movq (%rsi), %r11 +; CHECK-BASELINE-NEXT: xorq %rdi, %r11 +; CHECK-BASELINE-NEXT: andq (%rcx), %r11 +; CHECK-BASELINE-NEXT: xorq %rdi, %r11 +; CHECK-BASELINE-NEXT: movq 8(%rsi), %rdi +; CHECK-BASELINE-NEXT: xorq %r10, %rdi +; CHECK-BASELINE-NEXT: andq 8(%rcx), %rdi +; CHECK-BASELINE-NEXT: xorq %r10, %rdi +; CHECK-BASELINE-NEXT: movq 16(%rsi), %rdx +; CHECK-BASELINE-NEXT: xorq %r9, %rdx +; CHECK-BASELINE-NEXT: andq 16(%rcx), %rdx +; CHECK-BASELINE-NEXT: xorq %r9, %rdx +; CHECK-BASELINE-NEXT: movq 24(%rsi), %rsi +; CHECK-BASELINE-NEXT: xorq %r8, %rsi +; CHECK-BASELINE-NEXT: andq 24(%rcx), %rsi +; CHECK-BASELINE-NEXT: xorq %r8, %rsi +; CHECK-BASELINE-NEXT: movq %rsi, 24(%rax) +; CHECK-BASELINE-NEXT: movq %rdx, 16(%rax) +; CHECK-BASELINE-NEXT: movq %rdi, 8(%rax) +; CHECK-BASELINE-NEXT: movq %r11, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i64: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movq (%rcx), %r8 -; CHECK-SSE1-NEXT: movq 8(%rcx), %r9 -; CHECK-SSE1-NEXT: movq 16(%rcx), %rdi -; CHECK-SSE1-NEXT: movq 24(%rcx), %rcx -; CHECK-SSE1-NEXT: movq 24(%rsi), %r10 -; CHECK-SSE1-NEXT: andq %rcx, %r10 -; CHECK-SSE1-NEXT: movq 16(%rsi), %r11 -; CHECK-SSE1-NEXT: andq %rdi, %r11 -; CHECK-SSE1-NEXT: movq 8(%rsi), %rbx -; CHECK-SSE1-NEXT: andq %r9, %rbx -; CHECK-SSE1-NEXT: movq (%rsi), %rsi -; CHECK-SSE1-NEXT: andq %r8, %rsi -; CHECK-SSE1-NEXT: notq %r8 -; CHECK-SSE1-NEXT: notq 
%r9 -; CHECK-SSE1-NEXT: notq %rdi -; CHECK-SSE1-NEXT: notq %rcx -; CHECK-SSE1-NEXT: andq 24(%rdx), %rcx -; CHECK-SSE1-NEXT: orq %r10, %rcx -; CHECK-SSE1-NEXT: andq 16(%rdx), %rdi -; CHECK-SSE1-NEXT: orq %r11, %rdi -; CHECK-SSE1-NEXT: andq 8(%rdx), %r9 -; CHECK-SSE1-NEXT: orq %rbx, %r9 -; CHECK-SSE1-NEXT: andq (%rdx), %r8 -; CHECK-SSE1-NEXT: orq %rsi, %r8 -; CHECK-SSE1-NEXT: movq %rcx, 24(%rax) -; CHECK-SSE1-NEXT: movq %rdi, 16(%rax) -; CHECK-SSE1-NEXT: movq %r9, 8(%rax) -; CHECK-SSE1-NEXT: movq %r8, (%rax) -; CHECK-SSE1-NEXT: popq %rbx +; CHECK-SSE1-NEXT: movq 24(%rdx), %r8 +; CHECK-SSE1-NEXT: movq 16(%rdx), %r9 +; CHECK-SSE1-NEXT: movq (%rdx), %rdi +; CHECK-SSE1-NEXT: movq 8(%rdx), %r10 +; CHECK-SSE1-NEXT: movq (%rsi), %r11 +; CHECK-SSE1-NEXT: xorq %rdi, %r11 +; CHECK-SSE1-NEXT: andq (%rcx), %r11 +; CHECK-SSE1-NEXT: xorq %rdi, %r11 +; CHECK-SSE1-NEXT: movq 8(%rsi), %rdi +; CHECK-SSE1-NEXT: xorq %r10, %rdi +; CHECK-SSE1-NEXT: andq 8(%rcx), %rdi +; CHECK-SSE1-NEXT: xorq %r10, %rdi +; CHECK-SSE1-NEXT: movq 16(%rsi), %rdx +; CHECK-SSE1-NEXT: xorq %r9, %rdx +; CHECK-SSE1-NEXT: andq 16(%rcx), %rdx +; CHECK-SSE1-NEXT: xorq %r9, %rdx +; CHECK-SSE1-NEXT: movq 24(%rsi), %rsi +; CHECK-SSE1-NEXT: xorq %r8, %rsi +; CHECK-SSE1-NEXT: andq 24(%rcx), %rsi +; CHECK-SSE1-NEXT: xorq %r8, %rsi +; CHECK-SSE1-NEXT: movq %rsi, 24(%rax) +; CHECK-SSE1-NEXT: movq %rdx, 16(%rax) +; CHECK-SSE1-NEXT: movq %rdi, 8(%rax) +; CHECK-SSE1-NEXT: movq %r11, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i64: