Index: llvm/trunk/include/llvm/CodeGen/LiveRegMatrix.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/LiveRegMatrix.h
+++ llvm/trunk/include/llvm/CodeGen/LiveRegMatrix.h
@@ -107,6 +107,13 @@
   /// with the highest enum value is returned.
   InterferenceKind checkInterference(LiveInterval &VirtReg, unsigned PhysReg);
 
+  /// Check for interference in the segment [Start, End) that may prevent
+  /// assignment to PhysReg. If this function returns true, there is
+  /// interference in the segment [Start, End) of some other interval already
+  /// assigned to PhysReg. If this function returns false, PhysReg is free in
+  /// the segment [Start, End).
+  bool checkInterference(SlotIndex Start, SlotIndex End, unsigned PhysReg);
+
   /// Assign VirtReg to PhysReg.
   /// This will mark VirtReg's live range as occupied in the LiveRegMatrix and
   /// update VirtRegMap. The live range is expected to be available in PhysReg.
Index: llvm/trunk/lib/CodeGen/LiveRegMatrix.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/LiveRegMatrix.cpp
+++ llvm/trunk/lib/CodeGen/LiveRegMatrix.cpp
@@ -205,3 +205,19 @@
 
   return IK_Free;
 }
+
+bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End,
+                                      unsigned PhysReg) {
+  // Construct artificial live range containing only one segment [Start, End).
+  VNInfo valno(0, Start);
+  LiveRange::Segment Seg(Start, End, &valno);
+  LiveRange LR;
+  LR.addSegment(Seg);
+
+  // Check for interference with that segment.
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    if (query(LR, *Units).checkInterference())
+      return true;
+  }
+  return false;
+}
Index: llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp
+++ llvm/trunk/lib/CodeGen/RegAllocGreedy.cpp
@@ -448,6 +448,9 @@
   bool splitCanCauseEvictionChain(unsigned Evictee, GlobalSplitCandidate &Cand,
                                   unsigned BBNumber,
                                   const AllocationOrder &Order);
+  bool splitCanCauseLocalSpill(unsigned VirtRegToSplit,
+                               GlobalSplitCandidate &Cand, unsigned BBNumber,
+                               const AllocationOrder &Order);
   BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate &,
                                      const AllocationOrder &Order,
                                      bool *CanCauseEvictionChain);
@@ -1427,7 +1430,7 @@
 /// we are splitting for and the interferences.
 /// \param BBNumber The number of a BB for which the region split process will
 /// create a local split interval.
-/// \param Order The phisical registers that may get evicted by a split
+/// \param Order The physical registers that may get evicted by a split
 /// artifact of Evictee.
 /// \return True if splitting Evictee may cause a bad eviction chain, false
 /// otherwise.
@@ -1448,8 +1451,8 @@
       getCheapestEvicteeWeight(Order, LIS->getInterval(Evictee),
                                Cand.Intf.first(), Cand.Intf.last(), &MaxWeight);
 
-  // The bad eviction chain occurs when either the split candidate the
-  // evited reg or one of the split artifact will evict the evicting reg.
+  // The bad eviction chain occurs when either the split candidate is the
+  // evicting reg or one of the split artifacts will evict the evicting reg.
   if ((PhysReg != Cand.PhysReg) && (PhysReg != FutureEvictedPhysReg))
     return false;
 
@@ -1479,6 +1482,54 @@
   return true;
 }
 
+/// \brief Check if splitting VirtRegToSplit will create a local split interval
+/// in basic block number BBNumber that may cause a spill.
+///
+/// \param VirtRegToSplit The register considered to be split.
+/// \param Cand           The split candidate that determines the physical
+///                       register we are splitting for and the interferences.
+/// \param BBNumber       The number of a BB for which the region split process
+///                       will create a local split interval.
+/// \param Order          The physical registers that may get evicted by a
+///                       split artifact of VirtRegToSplit.
+/// \return True if splitting VirtRegToSplit may cause a spill, false
+/// otherwise.
+bool RAGreedy::splitCanCauseLocalSpill(unsigned VirtRegToSplit,
+                                       GlobalSplitCandidate &Cand,
+                                       unsigned BBNumber,
+                                       const AllocationOrder &Order) {
+  Cand.Intf.moveToBlock(BBNumber);
+
+  // Check if the local interval will find a non-interfering assignment.
+  for (auto PhysReg : Order.getOrder()) {
+    if (!Matrix->checkInterference(Cand.Intf.first().getPrevIndex(),
+                                   Cand.Intf.last(), PhysReg))
+      return false;
+  }
+
+  // Check if the local interval will evict a cheaper interval.
+  float CheapestEvictWeight = 0;
+  unsigned FutureEvictedPhysReg = getCheapestEvicteeWeight(
+      Order, LIS->getInterval(VirtRegToSplit), Cand.Intf.first(),
+      Cand.Intf.last(), &CheapestEvictWeight);
+
+  // Have we found an interval that can be evicted?
+  if (FutureEvictedPhysReg) {
+    VirtRegAuxInfo VRAI(*MF, *LIS, VRM, getAnalysis<MachineLoopInfo>(), *MBFI);
+    float splitArtifactWeight =
+        VRAI.futureWeight(LIS->getInterval(VirtRegToSplit),
+                          Cand.Intf.first().getPrevIndex(), Cand.Intf.last());
+    // Will the weight of the local interval be higher than the cheapest evictee
+    // weight? If so, it will evict it and will not cause a spill.
+    if (splitArtifactWeight >= 0 && splitArtifactWeight > CheapestEvictWeight)
+      return false;
+  }
+
+  // The local interval is not able to find a non-interfering assignment and is
+  // not able to evict a less worthy interval; therefore, it can cause a spill.
+  return true;
+}
+
 /// calcGlobalSplitCost - Return the global split cost of following the split
 /// pattern in LiveBundles. This cost should be added to the local cost of the
 /// interference pattern in SplitConstraints.
@@ -1499,19 +1550,26 @@
     Cand.Intf.moveToBlock(BC.Number);
 
     // Check wheather a local interval is going to be created during the region
-    // split.
-    if (EnableAdvancedRASplitCost && CanCauseEvictionChain &&
-        Cand.Intf.hasInterference() && BI.LiveIn && BI.LiveOut && RegIn &&
-        RegOut) {
-
-      if (splitCanCauseEvictionChain(VirtRegToSplit, Cand, BC.Number, Order)) {
-        // This interfernce cause our eviction from this assignment, we might
-        // evict somebody else, add that cost.
+    // split. Calculate the advanced split cost (the cost of local intervals)
+    // if the option is enabled.
+    if (EnableAdvancedRASplitCost && Cand.Intf.hasInterference() && BI.LiveIn &&
+        BI.LiveOut && RegIn && RegOut) {
+
+      if (CanCauseEvictionChain &&
+          splitCanCauseEvictionChain(VirtRegToSplit, Cand, BC.Number, Order)) {
+        // This interference causes our eviction from this assignment; we might
+        // evict somebody else and eventually someone will spill. Add that cost.
         // See splitCanCauseEvictionChain for detailed description of scenarios.
         GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
         GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
 
         *CanCauseEvictionChain = true;
+
+      } else if (splitCanCauseLocalSpill(VirtRegToSplit, Cand, BC.Number,
+                                         Order)) {
+        // This interference causes the local interval to spill. Add that cost.
+        GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
+        GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
       }
     }
@@ -1540,7 +1598,7 @@
         // region split.
         if (EnableAdvancedRASplitCost && CanCauseEvictionChain &&
             splitCanCauseEvictionChain(VirtRegToSplit, Cand, Number, Order)) {
-          // This interfernce cause our eviction from this assignment, we might
+          // This interference causes our eviction from this assignment, we might
           // evict somebody else, add that cost.
           // See splitCanCauseEvictionChain for detailed description of
           // scenarios.
Index: llvm/trunk/test/CodeGen/X86/bug26810.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/bug26810.ll
+++ llvm/trunk/test/CodeGen/X86/bug26810.ll
@@ -22,9 +22,10 @@
 ; CHECK: bb.2.for.body:
 ; CHECK: SUBPDrr
 ; CHECK-NEXT: MOVAPSmr
-; CHECK-NEXT: MOVAPSrm
 ; CHECK-NEXT: MULPDrm
+; CHECK-NEXT: MOVAPSrm
 ; CHECK-NEXT: ADDPDrr
+; CHECK-NEXT: MOVAPSmr
 ; CHECK-NEXT: ADD32ri8
 
 target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
Index: llvm/trunk/test/CodeGen/X86/regalloc-advanced-split-cost.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/regalloc-advanced-split-cost.ll
+++ llvm/trunk/test/CodeGen/X86/regalloc-advanced-split-cost.ll
@@ -0,0 +1,89 @@
+; RUN: llc < %s -march=x86 -regalloc=greedy --debug-only=regalloc 2>&1 | FileCheck %s
+
+; This test is meant to make sure that the weight of local intervals that are
+; created during a split is taken into account when choosing the best candidate
+; register.
+; %shl is the interval that will be split.
+; The inline assembly calls interfere with %shl and leave only 2 available split
+; candidates: %esi and %ebp.
+; The old code would have chosen %esi as the split candidate, ignoring the fact
+; that this choice will cause the creation of a local interval that will have a
+; certain spill cost.
+; The new code chooses %ebp as the split candidate as it has a lower spill cost.
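+
+; The CHECK lines below match the -debug-only=regalloc output for the region
+; split of %shl: most candidates are reported with "no positive bundles", the
+; two viable candidates (%esi and %ebp) print their static split cost
+; ("static ="), and the final "Split for" line verifies that %ebp is chosen.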
+ +; Make sure the split behaves as expected +; CHECK: RS_Split Cascade 1 +; CHECK-NOT: %eax static = +; CHECK: %eax no positive bundles +; CHECK-NEXT: %ecx no positive bundles +; CHECK-NEXT: %edx no positive bundles +; CHECK-NEXT: %esi static = +; CHECK-NEXT: %edi no positive bundles +; CHECK-NEXT: %ebx no positive bundles +; CHECK-NEXT: %ebp static = +; CHECK: Split for %ebp + +; Function Attrs: nounwind +define i32 @foo(i32* %array, i32 %cond1, i32 %val) local_unnamed_addr #0 { +entry: + %array.addr = alloca i32*, align 4 + store i32* %array, i32** %array.addr, align 4, !tbaa !3 + %0 = load i32, i32* %array, align 4, !tbaa !7 + %arrayidx1 = getelementptr inbounds i32, i32* %array, i32 1 + %1 = load i32, i32* %arrayidx1, align 4, !tbaa !7 + %arrayidx2 = getelementptr inbounds i32, i32* %array, i32 2 + %2 = load i32, i32* %arrayidx2, align 4, !tbaa !7 + %arrayidx3 = getelementptr inbounds i32, i32* %array, i32 3 + %3 = load i32, i32* %arrayidx3, align 4, !tbaa !7 + %arrayidx4 = getelementptr inbounds i32, i32* %array, i32 4 + %4 = load i32, i32* %arrayidx4, align 4, !tbaa !7 + %arrayidx6 = getelementptr inbounds i32, i32* %array, i32 %val + %5 = load i32, i32* %arrayidx6, align 4, !tbaa !7 + %shl = shl i32 %5, 5 + %tobool = icmp eq i32 %cond1, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx7 = getelementptr inbounds i32, i32* %array, i32 6 + store i32 %shl, i32* %arrayidx7, align 4, !tbaa !7 + call void asm "nop", "=*m,r,r,r,r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32** nonnull %array.addr, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32** nonnull %array.addr) #1, !srcloc !9 + %6 = load i32*, i32** %array.addr, align 4, !tbaa !3 + %arrayidx8 = getelementptr inbounds i32, i32* %6, i32 7 + br label %if.end + +if.else: ; preds = %entry + %arrayidx5 = getelementptr inbounds i32, i32* %array, i32 5 + %7 = load i32, i32* %arrayidx5, align 4, !tbaa !7 + %arrayidx9 = getelementptr inbounds i32, i32* %array, i32 8 + store i32 %shl, i32* %arrayidx9, align 4, !tbaa !7 + call void asm "nop", "=*m,{ax},{bx},{cx},{dx},{di},{si},{ebp},*m,~{dirflag},~{fpsr},~{flags}"(i32** nonnull %array.addr, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %7, i32* undef, i32** nonnull %array.addr) #1, !srcloc !10 + %8 = load i32*, i32** %array.addr, align 4, !tbaa !3 + %arrayidx10 = getelementptr inbounds i32, i32* %8, i32 9 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %arrayidx10.sink = phi i32* [ %arrayidx10, %if.else ], [ %arrayidx8, %if.then ] + %9 = phi i32* [ %8, %if.else ], [ %6, %if.then ] + store i32 %shl, i32* %arrayidx10.sink, align 4, !tbaa !7 + %10 = load i32, i32* %9, align 4, !tbaa !7 + %add = add nsw i32 %10, %shl + ret i32 %add +} + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"NumRegisterParameters", i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 6.0.0"} +!3 = !{!4, !4, i64 0} +!4 = !{!"any pointer", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} +!7 = !{!8, !8, i64 0} +!8 = !{!"int", !5, i64 0} +!9 = !{i32 268} +!10 = !{i32 390} Index: 
llvm/trunk/test/CodeGen/X86/sad.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sad.ll +++ llvm/trunk/test/CodeGen/X86/sad.ll @@ -148,21 +148,21 @@ ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: pxor %xmm12, %xmm12 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm15, %xmm15 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm4 @@ -216,61 +216,65 @@ ; SSE2-NEXT: psrad $31, %xmm6 ; SSE2-NEXT: paddd %xmm6, %xmm7 ; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm7, %xmm13 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm4, %xmm6 ; SSE2-NEXT: psrad $31, %xmm6 ; SSE2-NEXT: paddd %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm6, %xmm4 ; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: paddd %xmm4, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm1, %xmm6 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm14 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm0, %xmm15 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, 
-{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm8, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm8 ; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm14 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm15, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm15, %xmm0 ; SSE2-NEXT: paddd %xmm14, %xmm13 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm3, %xmm4 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] -; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm13, %xmm6 +; SSE2-NEXT: paddd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1] +; SSE2-NEXT: paddd %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -400,43 +404,43 @@ ; SSE2-NEXT: subq $200, %rsp ; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm15, %xmm15 -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa 
%xmm0, (%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movaps a+1040(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 ; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 ; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 @@ -497,7 +501,7 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] ; SSE2-NEXT: psubd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE2-NEXT: psubd %xmm0, %xmm15 ; SSE2-NEXT: movdqa %xmm7, %xmm0 @@ -506,7 +510,7 @@ ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] ; SSE2-NEXT: psubd %xmm3, %xmm9 ; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] @@ -539,7 +543,7 @@ ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; SSE2-NEXT: psubd %xmm13, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm2, %xmm13 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 @@ -554,15 +558,13 @@ ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm6, %xmm1 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm4, %xmm1 
; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm4 @@ -570,7 +572,6 @@ ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm8, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm8 @@ -578,7 +579,6 @@ ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm11, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 @@ -587,55 +587,64 @@ ; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm11 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm15, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm15 ; SSE2-NEXT: pxor %xmm1, %xmm15 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm15, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 ; SSE2-NEXT: pxor %xmm1, %xmm10 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm12, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm12 ; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm12, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill ; 
SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm9, %xmm1 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm9, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm7 @@ -643,32 +652,40 @@ ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm13, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: pxor %xmm0, %xmm7 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm3, %xmm8 -; SSE2-NEXT: paddd %xmm2, %xmm15 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm8, %xmm13 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm5, %xmm0 -; SSE2-NEXT: paddd %xmm11, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: paddd %xmm13, %xmm1 -; SSE2-NEXT: paddd %xmm15, %xmm1 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]