Index: lib/CodeGen/PHIElimination.cpp
===================================================================
--- lib/CodeGen/PHIElimination.cpp
+++ lib/CodeGen/PHIElimination.cpp
@@ -56,6 +56,11 @@
 SplitAllCriticalEdges("phi-elim-split-all-critical-edges", cl::init(false),
                       cl::Hidden, cl::desc("Split all critical edges during "
                                            "PHI elimination"));
+static cl::opt<unsigned>
+BackedgeSplitThreshold("phi-elim-backedge-split-threshold", cl::init(2),
+                       cl::Hidden, cl::desc("Number of copies necessary to "
+                                            "allow splitting a loop "
+                                            "backedge (0=never)"));
 
 static cl::opt<bool> NoPhiElimLiveOutEarlyExit(
     "no-phi-elim-live-out-early-exit", cl::init(false), cl::Hidden,
@@ -563,6 +568,8 @@
 
   const MachineLoop *CurLoop = MLI ? MLI->getLoopFor(&MBB) : nullptr;
   bool IsLoopHeader = CurLoop && &MBB == CurLoop->getHeader();
+  bool HasSingleLatch = CurLoop && CurLoop->getLoopLatch();
+  unsigned NumBackedgeSplits = 0;
 
   bool Changed = false;
   for (MachineBasicBlock::iterator BBI = MBB.begin(), BBE = MBB.end();
@@ -574,14 +581,6 @@
       if (PreMBB->succ_size() == 1)
         continue;
 
-      // Avoid splitting backedges of loops. It would introduce small
-      // out-of-line blocks into the loop which is very bad for code placement.
-      if (PreMBB == &MBB && !SplitAllCriticalEdges)
-        continue;
-      const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : nullptr;
-      if (IsLoopHeader && PreLoop == CurLoop && !SplitAllCriticalEdges)
-        continue;
-
       // LV doesn't consider a phi use live-out, so isLiveOut only returns true
       // when the source register is live-out for some other reason than a phi
       // use. That means the copy we will insert in PreMBB won't be a kill, and
@@ -608,6 +607,7 @@
       ShouldSplit = ShouldSplit && !isLiveIn(Reg, &MBB);
 
       // Check for a loop exiting edge.
+      const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : nullptr;
       if (!ShouldSplit && CurLoop != PreLoop) {
         LLVM_DEBUG({
           dbgs() << "Split wouldn't help, maybe avoid loop copies?\n";
@@ -624,6 +624,30 @@
       }
       if (!ShouldSplit && !SplitAllCriticalEdges)
         continue;
+
+      // Avoid splitting backedges of loops. It would introduce small
+      // out-of-line blocks into the loop which is very bad for code
+      // placement.
+      //
+      // However, allow splitting for loops with a single latch and multiple
+      // copies. The new block can be placed before the header, so the
+      // penalty is small, and it can reduce spilling.
+      //
+      // FIXME: We might need a better heuristic here? This helps in some
+      // cases, but not consistently.
+      if (IsLoopHeader && PreLoop == CurLoop && !SplitAllCriticalEdges) {
+        if (!HasSingleLatch || !BackedgeSplitThreshold) {
+          LLVM_DEBUG(dbgs() << "Split disabled due to loop backedge\n");
+          continue;
+        }
+        ++NumBackedgeSplits;
+        if (NumBackedgeSplits < BackedgeSplitThreshold) {
+          LLVM_DEBUG(dbgs() << "Delaying loop backedge split: "
+                            << NumBackedgeSplits
+                            << "\n");
+          continue;
+        }
+      }
       if (!PreMBB->SplitCriticalEdge(&MBB, *this)) {
         LLVM_DEBUG(dbgs() << "Failed to split critical edge.\n");
         continue;
Index: test/CodeGen/Hexagon/swp-kernel-phi1.ll
===================================================================
--- test/CodeGen/Hexagon/swp-kernel-phi1.ll
+++ test/CodeGen/Hexagon/swp-kernel-phi1.ll
@@ -7,7 +7,7 @@
 ; vreg5 = phi(x, vreg4) is scheduled in stage 1, cycle 0
 ; vreg4 = phi(y, z) is scheduled in stage 0, cycle 0
 
-; CHECK-DAG: :[[REG0:[0-9]+]]{{.*}} = {{.*}},#17
+; CHECK-DAG: r[[REG0:[0-9]+]]{{.*}} = combine(#17
 ; CHECK-DAG: loop0(.LBB0_[[LOOP:.]],
 ; CHECK: .LBB0_[[LOOP]]:
 ; CHECK: r{{[0-9]+}} = sxth(r[[REG0]])
Index: test/CodeGen/Thumb/phi-backedge-split.ll
===================================================================
--- /dev/null
+++ test/CodeGen/Thumb/phi-backedge-split.ll
@@ -0,0 +1,252 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-unknown-unknown-eabi"
+
+; This isn't really a great test... but I'm not sure how to write a better
+; one. The important thing to note here is the number of spills
+; inside the loop; breaking the backedge reduces them substantially.
+
+define dso_local void @a(i32 %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32* %arg8, i32* %arg9) {
+; CHECK-LABEL: a:
+; CHECK:       @ %bb.0: @ %bb
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #60
+; CHECK-NEXT:    sub sp, #60
+; CHECK-NEXT:    str r3, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT:    str r2, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp, #52] @ 4-byte Spill
+; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    ldr r0, [sp, #100]
+; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    ldr r0, [sp, #96]
+; CHECK-NEXT:    str r0, [sp, #40] @ 4-byte Spill
+; CHECK-NEXT:    ldr r0, [sp, #92]
+; CHECK-NEXT:    str r0, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT:    ldr r5, [sp, #88]
+; CHECK-NEXT:    ldr r0, [sp, #84]
+; CHECK-NEXT:    str r0, [sp, #48] @ 4-byte Spill
+; CHECK-NEXT:    ldr r0, [sp, #80]
+; CHECK-NEXT:    str r0, [sp, #44] @ 4-byte Spill
+; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:  .LBB0_1: @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    ldr r5, [sp, #48] @ 4-byte Reload
+; CHECK-NEXT:    str r2, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT:    ldr r4, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT:    str r4, [sp, #48] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #44] @ 4-byte Spill
+; CHECK-NEXT:    mov r4, r6
+; CHECK-NEXT:    str r3, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
+; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT:    ldr r1, [sp, #52] @ 4-byte Reload
+; CHECK-NEXT:    str r1, [sp, #56] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp, #52] @ 4-byte Spill
+; CHECK-NEXT:  .LBB0_2: @ %bb10
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    str r5, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    mov r1, r4
+; CHECK-NEXT:    adds r1, #14
+; CHECK-NEXT:    movs r0, #15
+; CHECK-NEXT:    ands r1, r0
+;
CHECK-NEXT: lsls r1, r1, #2 +; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [r6, r1] +; CHECK-NEXT: movs r1, #19 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: rors r2, r1 +; CHECK-NEXT: lsrs r1, r5, #10 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: movs r2, #17 +; CHECK-NEXT: rors r5, r2 +; CHECK-NEXT: eors r5, r1 +; CHECK-NEXT: adds r1, r4, #1 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: ands r1, r0 +; CHECK-NEXT: lsls r1, r1, #2 +; CHECK-NEXT: ldr r1, [r6, r1] +; CHECK-NEXT: movs r7, #18 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: rors r3, r7 +; CHECK-NEXT: lsrs r7, r1, #3 +; CHECK-NEXT: eors r7, r3 +; CHECK-NEXT: movs r3, #7 +; CHECK-NEXT: rors r1, r3 +; CHECK-NEXT: eors r1, r7 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: adds r3, #9 +; CHECK-NEXT: ands r3, r0 +; CHECK-NEXT: lsls r3, r3, #2 +; CHECK-NEXT: ldr r3, [r6, r3] +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: ands r4, r0 +; CHECK-NEXT: lsls r3, r4, #2 +; CHECK-NEXT: ldr r0, [r6, r3] +; CHECK-NEXT: adds r0, r1, r0 +; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: ldm r1!, {r4} +; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: str r0, [r6, r3] +; CHECK-NEXT: movs r1, #11 +; CHECK-NEXT: ldr r6, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: rors r3, r1 +; CHECK-NEXT: movs r1, #6 +; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: rors r5, r1 +; CHECK-NEXT: eors r5, r3 +; CHECK-NEXT: movs r1, #25 +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: rors r3, r1 +; CHECK-NEXT: eors r3, r5 +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r2, r1 +; CHECK-NEXT: bics r1, r6 +; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: ldr r6, [sp, #48] @ 4-byte Reload +; CHECK-NEXT: ands r5, r6 +; CHECK-NEXT: orrs r5, r1 +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: adds r1, r5, r1 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adds r1, r1, r4 +; CHECK-NEXT: adds r4, r1, r0 +; CHECK-NEXT: ldr r0, [sp, #56] @ 4-byte Reload +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: ldr r3, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: ands r1, r3 +; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: ldr r7, [sp, #52] @ 4-byte Reload +; CHECK-NEXT: ands r0, r7 +; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: movs r1, #13 +; CHECK-NEXT: mov r6, r7 +; CHECK-NEXT: rors r6, r1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: mov r5, r7 +; CHECK-NEXT: rors r5, r1 +; CHECK-NEXT: eors r5, r6 +; CHECK-NEXT: ldr r6, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: movs r1, #22 +; CHECK-NEXT: rors r7, r1 +; CHECK-NEXT: eors r7, r5 +; CHECK-NEXT: adds r0, r7, r0 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: adds r1, r4, r1 +; CHECK-NEXT: cmp r6, #64 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.3: @ %bb95 +; CHECK-NEXT: mov r4, sp +; CHECK-NEXT: str r1, [r4] +; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: str r1, [r4, #4] +; CHECK-NEXT: ldr r1, [sp, #48] @ 4-byte Reload +; CHECK-NEXT: str r1, [r4, #8] +; CHECK-NEXT: str r2, [r4, #12] +; CHECK-NEXT: ldr r1, [sp, #52] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #56] @ 4-byte Reload +; CHECK-NEXT: bl use +; CHECK-NEXT: add sp, #60 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +bb: + br label %bb10 + +bb10: + %tmp = phi i32 [ %arg6, %bb ], [ %tmp12, %bb10 ] + %tmp11 = phi i32 [ %arg7, %bb ], [ %tmp, %bb10 ] + %tmp12 = phi i32 [ %arg5, %bb ], [ %tmp13, %bb10 ] + %tmp13 = phi i32 [ %arg4, %bb ], [ %tmp92, %bb10 ] + %tmp14 = phi i32 [ 0, %bb ], [ %tmp19, %bb10 ] + %tmp15 = phi i32 [ %arg3, %bb ], [ %tmp16, 
%bb10 ] + %tmp16 = phi i32 [ %arg2, %bb ], [ %tmp17, %bb10 ] + %tmp17 = phi i32 [ %arg1, %bb ], [ %tmp18, %bb10 ] + %tmp18 = phi i32 [ %arg, %bb ], [ %tmp93, %bb10 ] + %tmp19 = add nuw nsw i32 %tmp14, 1 + %tmp20 = and i32 %tmp19, 15 + %tmp21 = getelementptr inbounds i32, i32* %arg9, i32 %tmp20 + %tmp22 = load i32, i32* %tmp21, align 4 + %tmp23 = lshr i32 %tmp22, 7 + %tmp24 = shl i32 %tmp22, 25 + %tmp25 = or i32 %tmp23, %tmp24 + %tmp26 = lshr i32 %tmp22, 18 + %tmp27 = shl i32 %tmp22, 14 + %tmp28 = or i32 %tmp26, %tmp27 + %tmp29 = lshr i32 %tmp22, 3 + %tmp30 = xor i32 %tmp28, %tmp29 + %tmp31 = xor i32 %tmp30, %tmp25 + %tmp32 = add nuw nsw i32 %tmp14, 14 + %tmp33 = and i32 %tmp32, 15 + %tmp34 = getelementptr inbounds i32, i32* %arg9, i32 %tmp33 + %tmp35 = load i32, i32* %tmp34, align 4 + %tmp36 = lshr i32 %tmp35, 17 + %tmp37 = shl i32 %tmp35, 15 + %tmp38 = or i32 %tmp36, %tmp37 + %tmp39 = lshr i32 %tmp35, 19 + %tmp40 = shl i32 %tmp35, 13 + %tmp41 = or i32 %tmp39, %tmp40 + %tmp42 = lshr i32 %tmp35, 10 + %tmp43 = xor i32 %tmp41, %tmp42 + %tmp44 = xor i32 %tmp43, %tmp38 + %tmp45 = lshr i32 %tmp13, 6 + %tmp46 = shl i32 %tmp13, 26 + %tmp47 = or i32 %tmp45, %tmp46 + %tmp48 = lshr i32 %tmp13, 11 + %tmp49 = shl i32 %tmp13, 21 + %tmp50 = or i32 %tmp48, %tmp49 + %tmp51 = xor i32 %tmp47, %tmp50 + %tmp52 = lshr i32 %tmp13, 25 + %tmp53 = shl i32 %tmp13, 7 + %tmp54 = or i32 %tmp52, %tmp53 + %tmp55 = xor i32 %tmp51, %tmp54 + %tmp56 = and i32 %tmp13, %tmp12 + %tmp57 = xor i32 %tmp13, -1 + %tmp58 = and i32 %tmp, %tmp57 + %tmp59 = or i32 %tmp56, %tmp58 + %tmp60 = getelementptr inbounds i32, i32* %arg8, i32 %tmp14 + %tmp61 = load i32, i32* %tmp60, align 4 + %tmp62 = add nuw nsw i32 %tmp14, 9 + %tmp63 = and i32 %tmp62, 15 + %tmp64 = getelementptr inbounds i32, i32* %arg9, i32 %tmp63 + %tmp65 = load i32, i32* %tmp64, align 4 + %tmp66 = and i32 %tmp14, 15 + %tmp67 = getelementptr inbounds i32, i32* %arg9, i32 %tmp66 + %tmp68 = load i32, i32* %tmp67, align 4 + %tmp69 = add i32 %tmp31, %tmp65 + %tmp70 = add i32 %tmp69, %tmp68 + %tmp71 = add i32 %tmp70, %tmp44 + store i32 %tmp71, i32* %tmp67, align 4 + %tmp72 = add i32 %tmp59, %tmp11 + %tmp73 = add i32 %tmp72, %tmp55 + %tmp74 = add i32 %tmp73, %tmp61 + %tmp75 = add i32 %tmp74, %tmp71 + %tmp76 = lshr i32 %tmp18, 2 + %tmp77 = shl i32 %tmp18, 30 + %tmp78 = or i32 %tmp76, %tmp77 + %tmp79 = lshr i32 %tmp18, 13 + %tmp80 = shl i32 %tmp18, 19 + %tmp81 = or i32 %tmp79, %tmp80 + %tmp82 = xor i32 %tmp78, %tmp81 + %tmp83 = lshr i32 %tmp18, 22 + %tmp84 = shl i32 %tmp18, 10 + %tmp85 = or i32 %tmp83, %tmp84 + %tmp86 = xor i32 %tmp82, %tmp85 + %tmp87 = xor i32 %tmp17, %tmp16 + %tmp88 = and i32 %tmp18, %tmp87 + %tmp89 = and i32 %tmp17, %tmp16 + %tmp90 = xor i32 %tmp88, %tmp89 + %tmp91 = add i32 %tmp86, %tmp90 + %tmp92 = add i32 %tmp75, %tmp15 + %tmp93 = add i32 %tmp91, %tmp75 + %tmp94 = icmp eq i32 %tmp19, 64 + br i1 %tmp94, label %bb95, label %bb10 + +bb95: + tail call void @use(i32 %tmp93, i32 %tmp18, i32 %tmp17, i32 %tmp16, i32 %tmp92, i32 %tmp13, i32 %tmp12, i32 %tmp) + ret void +} + +declare dso_local void @use(i32, i32, i32, i32, i32, i32, i32, i32) Index: test/CodeGen/X86/bug26810.ll =================================================================== --- test/CodeGen/X86/bug26810.ll +++ test/CodeGen/X86/bug26810.ll @@ -21,11 +21,9 @@ ; CHECK-LABEL: name: loop ; CHECK: bb.2.for.body: ; CHECK: SUBPDrr -; CHECK-NEXT: MOVAPSmr +; CHECK-NEXT: COPY ; CHECK-NEXT: MULPDrm -; CHECK-NEXT: MOVAPSrm -; CHECK-NEXT: ADDPDrr -; CHECK-NEXT: MOVAPSmr +; CHECK-NEXT: ADDPDrm ; CHECK-NEXT: 
ADD32ri8 target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" Index: test/CodeGen/X86/madd.ll =================================================================== --- test/CodeGen/X86/madd.ll +++ test/CodeGen/X86/madd.ll @@ -198,28 +198,30 @@ ; SSE2-NEXT: movl %edx, %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm3 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm4 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm3, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm4 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm5 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm1 +; SSE2-NEXT: pmaddwd %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm2 +; SSE2-NEXT: pmaddwd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -352,42 +354,46 @@ ; SSE2-NEXT: movl %edx, %eax ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB3_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm5 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm6 -; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm7 -; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm9 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm0 -; SSE2-NEXT: pmaddwd %xmm5, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm0 -; SSE2-NEXT: pmaddwd %xmm6, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm0 -; SSE2-NEXT: pmaddwd %xmm7, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm0 -; SSE2-NEXT: pmaddwd %xmm9, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm6 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm7 +; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm0 +; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm5 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm1 +; SSE2-NEXT: pmaddwd %xmm6, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm3 +; SSE2-NEXT: pmaddwd %xmm7, %xmm3 +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm2 +; SSE2-NEXT: pmaddwd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm4 +; SSE2-NEXT: pmaddwd 
%xmm5, %xmm4 +; SSE2-NEXT: paddd %xmm9, %xmm4 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm8, %xmm4 ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm8, %xmm2 +; SSE2-NEXT: paddd %xmm8, %xmm4 +; SSE2-NEXT: paddd %xmm3, %xmm4 ; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm2 +; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -775,36 +781,38 @@ ; SSE2-NEXT: movl %edx, %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB6_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmaddwd %xmm3, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: pmaddwd %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: pmaddwd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB6_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -941,45 +949,49 @@ ; SSE2-NEXT: movl %edx, %eax ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB7_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw 
$8, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm7 ; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm6 +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: pmaddwd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: pmaddwd %xmm2, %xmm4 +; SSE2-NEXT: paddd %xmm5, %xmm4 ; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pmaddwd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm9 -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pmaddwd %xmm6, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pmaddwd %xmm5, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: pmaddwd %xmm7, %xmm5 +; SSE2-NEXT: paddd %xmm3, %xmm5 +; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: pmaddwd %xmm6, %xmm3 +; SSE2-NEXT: paddd %xmm9, %xmm3 ; SSE2-NEXT: addq $32, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB7_1 @@ -987,12 +999,12 @@ ; SSE2-NEXT: paddd %xmm8, %xmm4 ; SSE2-NEXT: paddd %xmm8, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm8, %xmm9 ; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm5 +; SSE2-NEXT: paddd %xmm3, %xmm5 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -1274,16 +1286,18 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB9_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pmulhuw %xmm2, %xmm4 -; 
SSE2-NEXT: pmullw %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm0 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pmulhuw %xmm0, %xmm4 +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB9_1 @@ -1389,46 +1403,50 @@ ; SSE2-LABEL: test_unsigned_short_512: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB10_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm4 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm8 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm6 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm7 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pmulhuw %xmm4, %xmm5 -; SSE2-NEXT: pmullw %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm1 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm7 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm0 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhuw %xmm1, %xmm4 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: pmulhuw %xmm8, %xmm4 -; SSE2-NEXT: pmullw %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pmulhuw %xmm7, %xmm4 +; SSE2-NEXT: pmullw %xmm7, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: paddd %xmm7, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: paddd %xmm8, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; 
SSE2-NEXT: jne .LBB10_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_unsigned_short_512: @@ -1574,71 +1592,79 @@ ; SSE2-LABEL: test_unsigned_short_1024: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB11_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm0 -; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pmulhuw %xmm0, %xmm2 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pmulhuw %xmm0, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pmulhuw %xmm0, %xmm2 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm12 +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: movdqa %xmm4, %xmm14 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm1 +; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhuw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: paddd %xmm12, %xmm3 +; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), 
%xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pmulhuw %xmm0, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: paddd %xmm2, %xmm10 +; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm2 +; SSE2-NEXT: paddd %xmm13, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pmulhuw %xmm1, %xmm4 +; SSE2-NEXT: pmullw %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm7 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm4 +; SSE2-NEXT: paddd %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pmulhuw %xmm1, %xmm5 +; SSE2-NEXT: pmullw %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm6 +; SSE2-NEXT: paddd %xmm14, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm11 +; SSE2-NEXT: pmulhuw %xmm10, %xmm11 +; SSE2-NEXT: pmullw %xmm10, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE2-NEXT: paddd %xmm9, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: paddd %xmm8, %xmm6 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB11_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: paddd %xmm7, %xmm10 -; SSE2-NEXT: paddd %xmm3, %xmm10 -; SSE2-NEXT: paddd %xmm4, %xmm8 -; SSE2-NEXT: paddd %xmm5, %xmm9 -; SSE2-NEXT: paddd %xmm10, %xmm9 -; SSE2-NEXT: paddd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,0,1] -; SSE2-NEXT: paddd %xmm9, %xmm0 +; SSE2-NEXT: paddd %xmm7, %xmm4 +; SSE2-NEXT: paddd %xmm3, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm6, %xmm5 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -151,135 +151,127 @@ define i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm9, %xmm9 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; SSE2-NEXT: pxor %xmm15, %xmm15 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 +; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, %xmm12 +; SSE2-NEXT: movdqa %xmm4, %xmm13 +; SSE2-NEXT: movdqa %xmm5, %xmm14 +; SSE2-NEXT: movdqa %xmm15, %xmm11 +; SSE2-NEXT: movdqa a+1040(%rax), %xmm2 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm5, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] ; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE2-NEXT: movdqa b+1024(%rax), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; SSE2-NEXT: psubd %xmm6, %xmm15 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE2-NEXT: psubd %xmm1, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: psubd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE2-NEXT: psubd %xmm0, %xmm3 +; SSE2-NEXT: movdqa b+1040(%rax), %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm7 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE2-NEXT: psubd %xmm7, %xmm6 ; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] -; SSE2-NEXT: movdqa b+1024(%rax), %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm7 -; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: psubd %xmm10, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm11, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSE2-NEXT: psubd %xmm11, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE2-NEXT: psubd %xmm6, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE2-NEXT: psubd %xmm6, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; SSE2-NEXT: psubd %xmm9, %xmm8 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm10, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE2-NEXT: psubd %xmm7, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: psubd %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm15, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm15 +; SSE2-NEXT: pxor %xmm7, %xmm15 +; SSE2-NEXT: paddd %xmm11, %xmm15 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm5 +; SSE2-NEXT: pxor %xmm7, %xmm5 +; SSE2-NEXT: paddd %xmm14, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm7, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: paddd %xmm12, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm7, %xmm1 +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 ; SSE2-NEXT: paddd %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; 
SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm13 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm15 +; SSE2-NEXT: pxor %xmm7, %xmm6 +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm8 +; SSE2-NEXT: paddd %xmm0, %xmm8 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm14 +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm15, %xmm0 -; SSE2-NEXT: paddd %xmm14, %xmm13 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm13, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1] -; SSE2-NEXT: paddd %xmm6, %xmm0 +; SSE2-NEXT: paddd %xmm6, %xmm5 +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: paddd %xmm8, %xmm4 +; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: paddd %xmm15, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -418,9 +410,9 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-LABEL: sad_avx64i8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: subq $200, %rsp -; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: subq $184, %rsp ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 @@ -429,301 +421,279 @@ ; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: jmp .LBB2_1 ; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB2_2: # in Loop: Header=BB2_1 Depth=1 +; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movaps a+1040(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 -; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 -; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm15, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa a+1072(%rax), %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm12 +; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = 
xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm12, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7]
+; SSE2-NEXT: movdqa b+1072(%rax), %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm14[8],xmm10[9],xmm14[9],xmm10[10],xmm14[10],xmm10[11],xmm14[11],xmm10[12],xmm14[12],xmm10[13],xmm14[13],xmm10[14],xmm14[14],xmm10[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm10, %xmm8
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm8, %xmm6
+; SSE2-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm10, %xmm12
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm13, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3],xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm11, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm6, %xmm10
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm11, %xmm13
+; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm15
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm15, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movdqa b+1056(%rax), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; SSE2-NEXT: movdqa %xmm12, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
-; SSE2-NEXT: movdqa b+1072(%rax), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: movdqa b+1056(%rax), %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm7, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm1, %xmm15
+; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm1, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm2, %xmm9
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1024(%rax), %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm7, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm7, %xmm8
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm3, %xmm11
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm11
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm6, %xmm11
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm0, %xmm15
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm3, %xmm9
-; SSE2-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm0, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm2, %xmm3
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3],xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm0, %xmm13
-; SSE2-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: movdqa b+1040(%rax), %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm6, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm7, %xmm12
-; SSE2-NEXT: movdqa b+1040(%rax), %xmm13
-; SSE2-NEXT: movdqa %xmm13, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm7, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm3, %xmm9
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm13, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm3, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm13, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm13
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm1, %xmm6
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: psubd %xmm2, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm10, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE2-NEXT: psubd %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm10, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm10
+; SSE2-NEXT: pxor %xmm1, %xmm10
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm12, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm12
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm8
-; SSE2-NEXT: pxor %xmm1, %xmm8
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm11, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm13, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm11
-; SSE2-NEXT: pxor %xmm1, %xmm11
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: paddd %xmm1, %xmm13
+; SSE2-NEXT: pxor %xmm1, %xmm13
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: paddd (%rsp), %xmm2 # 16-byte Folded Reload
; SSE2-NEXT: movdqa %xmm15, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm15
; SSE2-NEXT: pxor %xmm1, %xmm15
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm8
+; SSE2-NEXT: pxor %xmm1, %xmm8
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm9, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm10
-; SSE2-NEXT: pxor %xmm1, %xmm10
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm1, %xmm9
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm11, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm12, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm11
+; SSE2-NEXT: pxor %xmm1, %xmm11
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm12
-; SSE2-NEXT: pxor %xmm1, %xmm12
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm14
+; SSE2-NEXT: psrad $31, %xmm14
+; SSE2-NEXT: paddd %xmm14, %xmm1
+; SSE2-NEXT: pxor %xmm14, %xmm1
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm7, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm7
+; SSE2-NEXT: pxor %xmm1, %xmm7
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; SSE2-NEXT: addq $4, %rax
+; SSE2-NEXT: jne .LBB2_2
+; SSE2-NEXT: # %bb.3: # %middle.block
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: pxor %xmm0, %xmm9
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm15, %xmm1
+; SSE2-NEXT: paddd %xmm12, %xmm6
+; SSE2-NEXT: paddd %xmm1, %xmm6
+; SSE2-NEXT: paddd %xmm9, %xmm11
+; SSE2-NEXT: paddd %xmm13, %xmm0
+; SSE2-NEXT: paddd %xmm6, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm4
+; SSE2-NEXT: paddd %xmm10, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm5, %xmm7
+; SSE2-NEXT: paddd %xmm3, %xmm7
; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: pxor %xmm0, %xmm7
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm11, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1]
; SSE2-NEXT: paddd %xmm7, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm13, %xmm1
-; SSE2-NEXT: movdqa %xmm13, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: addq $4, %rax
-; SSE2-NEXT: jne .LBB2_1
-; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: addq $200, %rsp
+; SSE2-NEXT: addq $184, %rsp
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_avx64i8:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: subq $24, %rsp
-; AVX1-NEXT: vpxor %xmm14, %xmm14, %xmm14
+; AVX1-NEXT: subq $120, %rsp
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12
+; AVX1-NEXT: vpxor %xmm14, %xmm14, %xmm14
; AVX1-NEXT: vpxor %xmm15, %xmm15, %xmm15
-; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vpxor %xmm11, %xmm11, %xmm11
; AVX1-NEXT: vpxor %xmm13, %xmm13, %xmm13
-; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10
-; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12
+; AVX1-NEXT: jmp .LBB2_1
; AVX1-NEXT: .p2align 4, 0x90
+; AVX1-NEXT: .LBB2_2: # in Loop: Header=BB2_1 Depth=1
+; AVX1-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: .LBB2_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -733,10 +703,9 @@
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa %ymm7, %ymm11
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -757,21 +726,19 @@
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm3
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm4
+; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm8
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm3
+; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm9
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm0
+; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm10
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5
@@ -785,85 +752,88 @@
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpabsd %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm7
+; AVX1-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpabsd %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm11, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm7
; AVX1-NEXT: vpabsd %xmm6, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm2
+; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpabsd %xmm5, %xmm2
-; AVX1-NEXT: vpaddd %xmm15, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm15
-; AVX1-NEXT: vpabsd %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm2
+; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm6
+; AVX1-NEXT: vpabsd %xmm10, %xmm1
+; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpabsd %xmm3, %xmm2
-; AVX1-NEXT: vpaddd %xmm14, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm14
-; AVX1-NEXT: vpabsd %xmm4, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2
+; AVX1-NEXT: vpabsd %xmm9, %xmm2
+; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm5
+; AVX1-NEXT: vpabsd %xmm8, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX1-NEXT: vpaddd %xmm13, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13
-; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
+; AVX1-NEXT: vpabsd %xmm3, %xmm0
+; AVX1-NEXT: vpaddd %xmm12, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm12
+; AVX1-NEXT: vpabsd %xmm4, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: vpaddd %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm14, %xmm1, %xmm1
; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8
-; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm14
+; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: vpaddd %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm15, %xmm1, %xmm1
; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9
-; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15
+; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: vpaddd %xmm10, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm11, %xmm1, %xmm1
; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm10
-; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm11
+; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpabsd (%rsp), %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT: vpaddd %xmm12, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12
+; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vpaddd %xmm13, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm13
; AVX1-NEXT: addq $4, %rax
-; AVX1-NEXT: jne .LBB2_1
-; AVX1-NEXT: # %bb.2: # %middle.block
-; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm3
+; AVX1-NEXT: jne .LBB2_2
+; AVX1-NEXT: # %bb.3: # %middle.block
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpaddd %xmm12, %xmm13, %xmm1
-; AVX1-NEXT: vpaddd %xmm10, %xmm7, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2
-; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1
+; AVX1-NEXT: vpaddd %xmm13, %xmm12, %xmm1
+; AVX1-NEXT: vpaddd %xmm11, %xmm7, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: addq $24, %rsp
+; AVX1-NEXT: addq $120, %rsp
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;