diff --git a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
--- a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
@@ -114,30 +114,30 @@
     const LiveRange *LR = nullptr;
     LiveRange::const_iterator LRI;  ///< current position in LR
     ConstSegmentIter LiveUnionI;    ///< current position in LiveUnion
-    SmallVector<LiveInterval *, 4> InterferingVRegs;
+    Optional<SmallVector<LiveInterval *, 4>> InterferingVRegs;
     bool CheckedFirstInterference = false;
     bool SeenAllInterferences = false;
     unsigned Tag = 0;
     unsigned UserTag = 0;
 
+  public:
+    Query() = default;
+    Query(const LiveRange &LR, const LiveIntervalUnion &LIU)
+        : LiveUnion(&LIU), LR(&LR) {}
+    Query(const Query &) = delete;
+    Query &operator=(const Query &) = delete;
+
     void reset(unsigned NewUserTag, const LiveRange &NewLR,
                const LiveIntervalUnion &NewLiveUnion) {
       LiveUnion = &NewLiveUnion;
       LR = &NewLR;
-      InterferingVRegs.clear();
+      InterferingVRegs = None;
       CheckedFirstInterference = false;
       SeenAllInterferences = false;
       Tag = NewLiveUnion.getTag();
       UserTag = NewUserTag;
     }
 
-  public:
-    Query() = default;
-    Query(const LiveRange &LR, const LiveIntervalUnion &LIU):
-      LiveUnion(&LIU), LR(&LR) {}
-    Query(const Query &) = delete;
-    Query &operator=(const Query &) = delete;
-
     void init(unsigned NewUserTag, const LiveRange &NewLR,
               const LiveIntervalUnion &NewLiveUnion) {
       if (UserTag == NewUserTag && LR == &NewLR && LiveUnion == &NewLiveUnion &&
@@ -164,7 +164,7 @@
 
     // Vector generated by collectInterferingVRegs.
     const SmallVectorImpl<LiveInterval *> &interferingVRegs() const {
-      return InterferingVRegs;
+      return *InterferingVRegs;
     }
   };
 
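The Optional wrapper above is the heart of the patch: InterferingVRegs is no longer an always-present (possibly empty) vector but a lazily materialized one. "Never collected" and "collected, nothing found" become distinct states, and interferingVRegs() now goes through an Optional dereference, so reading the list before collectInterferingVRegs() has run fails loudly in an asserts build instead of silently looking like "no interference". A stand-alone sketch of the pattern, with std::optional and std::vector standing in for llvm::Optional and SmallVector:

#include <cassert>
#include <cstdio>
#include <optional>
#include <vector>

int main() {
  // Unset: the interference list has never been computed for this query.
  std::optional<std::vector<int>> InterferingVRegs;
  assert(!InterferingVRegs.has_value());

  // What collectInterferingVRegs() now does on entry: materialize the cache.
  InterferingVRegs.emplace();
  assert(InterferingVRegs.has_value() && InterferingVRegs->empty());

  // What reset() now does (the patch writes `InterferingVRegs = None`):
  // back to "never collected", not merely an empty vector.
  InterferingVRegs = std::nullopt;
  std::printf("collected: %d\n", (int)InterferingVRegs.has_value()); // 0
}

The cost is one extra word of state per query; the benefit is that a consumer which forgets to collect trips an assertion rather than quietly iterating an empty list.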
diff --git a/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
--- a/llvm/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -112,7 +112,7 @@
 // Scan the vector of interfering virtual registers in this union. Assume it's
 // quite small.
 bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
-  return is_contained(InterferingVRegs, VirtReg);
+  return is_contained(*InterferingVRegs, VirtReg);
 }
 
 // Collect virtual registers in this union that interfere with this
@@ -126,9 +126,12 @@
 //
 unsigned LiveIntervalUnion::Query::
 collectInterferingVRegs(unsigned MaxInterferingRegs) {
+  if (!InterferingVRegs)
+    InterferingVRegs.emplace();
+
   // Fast path return if we already have the desired information.
-  if (SeenAllInterferences || InterferingVRegs.size() >= MaxInterferingRegs)
-    return InterferingVRegs.size();
+  if (SeenAllInterferences || InterferingVRegs->size() >= MaxInterferingRegs)
+    return InterferingVRegs->size();
 
   // Set up iterators on the first call.
   if (!CheckedFirstInterference) {
@@ -157,14 +160,14 @@
     LiveInterval *VReg = LiveUnionI.value();
     if (VReg != RecentReg && !isSeenInterference(VReg)) {
       RecentReg = VReg;
-      InterferingVRegs.push_back(VReg);
-      if (InterferingVRegs.size() >= MaxInterferingRegs)
-        return InterferingVRegs.size();
+      InterferingVRegs->push_back(VReg);
+      if (InterferingVRegs->size() >= MaxInterferingRegs)
+        return InterferingVRegs->size();
     }
     // This LiveUnion segment is no longer interesting.
     if (!(++LiveUnionI).valid()) {
       SeenAllInterferences = true;
-      return InterferingVRegs.size();
+      return InterferingVRegs->size();
     }
   }
 
@@ -185,7 +188,7 @@
     LiveUnionI.advanceTo(LRI->start);
   }
   SeenAllInterferences = true;
-  return InterferingVRegs.size();
+  return InterferingVRegs->size();
 }
 
 void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc,
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -216,7 +216,21 @@
   // Check for interference with that segment
   for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
-    if (query(LR, *Units).checkInterference())
+    // LR is stack-allocated. LiveRegMatrix caches queries by a key that
+    // includes the address of the live range. If (for the same reg unit) this
+    // checkInterference overload is called twice, without any other query()
+    // calls in between (on heap-allocated LiveRanges) - which would invalidate
+    // the cached query - the LR address seen the second time may well be the
+    // same as that seen the first time, while the Start/End/valno may not - yet
+    // the same cached result would be fetched. To avoid that, we don't cache
+    // this query.
+    //
+    // FIXME: the usability of the Query API needs to be improved to avoid
+    // subtle bugs due to query identity. Avoiding caching, for example, would
+    // greatly simplify things.
+    LiveIntervalUnion::Query Q;
+    Q.reset(UserTag, LR, Matrix[*Units]);
+    if (Q.checkInterference())
       return true;
   }
   return false;
 }
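The LiveRegMatrix hunk works around the cache-identity hazard its new comment describes: the matrix caches one Query per reg unit, keyed in part by the LiveRange's address, and the LR in this overload lives on the stack, so a later call can present a different range at the same address and hit a stale entry. A self-contained sketch of that failure mode, with invented names (Range, checkInterferenceCached); whether the stack slot is actually reused is compiler-dependent, which is exactly why the bug is subtle:

#include <cstdio>
#include <unordered_map>

struct Range { int Start, End; };

// Mimics a cache keyed by the range's address, which is effectively how the
// matrix's per-regunit cached Query behaves for this overload.
static std::unordered_map<const Range *, bool> Cache;

static bool checkInterferenceCached(const Range &R) {
  auto It = Cache.find(&R);
  if (It != Cache.end())
    return It->second;          // may describe a *previous* range at this address
  bool Result = R.Start < 10;   // stand-in for the real interference test
  Cache.emplace(&R, Result);
  return Result;
}

int main() {
  {
    Range A{5, 6};
    std::printf("A: %d\n", checkInterferenceCached(A)); // computed: 1
  }
  {
    Range B{50, 60}; // may land in A's stack slot -> same key, stale answer
    std::printf("B: %d\n", checkInterferenceCached(B)); // can print 1, not 0
  }
}

Building a local, uncached Query through the now-public reset() sidesteps the stale hit, at the cost of recomputing the interference test on every call.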
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -471,12 +471,13 @@
   bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool) const;
   bool canEvictInterference(LiveInterval &, MCRegister, bool, EvictionCost &,
                             const SmallVirtRegSet &) const;
-  bool canEvictInterferenceInRange(LiveInterval &VirtReg, MCRegister PhysReg,
-                                   SlotIndex Start, SlotIndex End,
-                                   EvictionCost &MaxCost) const;
+  bool canEvictInterferenceInRange(const LiveInterval &VirtReg,
+                                   MCRegister PhysReg, SlotIndex Start,
+                                   SlotIndex End, EvictionCost &MaxCost) const;
   MCRegister getCheapestEvicteeWeight(const AllocationOrder &Order,
-                                      LiveInterval &VirtReg, SlotIndex Start,
-                                      SlotIndex End, float *BestEvictWeight);
+                                      const LiveInterval &VirtReg,
+                                      SlotIndex Start, SlotIndex End,
+                                      float *BestEvictWeight) const;
   void evictInterference(LiveInterval &, MCRegister,
                          SmallVectorImpl<Register> &);
   bool mayRecolorAllInterferences(MCRegister PhysReg, LiveInterval &VirtReg,
@@ -979,7 +980,7 @@
 /// \param MaxCost Only look for cheaper candidates and update with new cost
 ///                when returning true.
 /// \return True when interference can be evicted cheaper than MaxCost.
-bool RAGreedy::canEvictInterferenceInRange(LiveInterval &VirtReg,
+bool RAGreedy::canEvictInterferenceInRange(const LiveInterval &VirtReg,
                                            MCRegister PhysReg, SlotIndex Start,
                                            SlotIndex End,
                                            EvictionCost &MaxCost) const {
@@ -987,6 +988,7 @@
 
   for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
     LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
+    Q.collectInterferingVRegs();
 
     // Check if any interfering live range is heavier than MaxWeight.
     for (const LiveInterval *Intf : reverse(Q.interferingVRegs())) {
@@ -1031,9 +1033,9 @@
 /// \return The PhysReg which is the best candidate for eviction and the
 /// eviction cost in BestEvictweight
 MCRegister RAGreedy::getCheapestEvicteeWeight(const AllocationOrder &Order,
-                                              LiveInterval &VirtReg,
+                                              const LiveInterval &VirtReg,
                                               SlotIndex Start, SlotIndex End,
-                                              float *BestEvictweight) {
+                                              float *BestEvictweight) const {
   EvictionCost BestEvictCost;
   BestEvictCost.setMax();
   BestEvictCost.MaxWeight = VirtReg.weight();
@@ -1556,25 +1558,9 @@
     return false;
   }
 
-  // Check if the local interval will evict a cheaper interval.
-  float CheapestEvictWeight = 0;
-  MCRegister FutureEvictedPhysReg = getCheapestEvicteeWeight(
-      Order, LIS->getInterval(VirtRegToSplit), Cand.Intf.first(),
-      Cand.Intf.last(), &CheapestEvictWeight);
-
-  // Have we found an interval that can be evicted?
-  if (FutureEvictedPhysReg) {
-    float splitArtifactWeight =
-        VRAI->futureWeight(LIS->getInterval(VirtRegToSplit),
-                           Cand.Intf.first().getPrevIndex(), Cand.Intf.last());
-    // Will the weight of the local interval be higher than the cheapest evictee
-    // weight? If so it will evict it and will not cause a spill.
-    if (splitArtifactWeight >= 0 && splitArtifactWeight > CheapestEvictWeight)
-      return false;
-  }
-
-  // The local interval is not able to find non interferencing assignment and
-  // not able to evict a less worthy interval, therfore, it can cause a spill.
+  // The local interval is not able to find a non-interfering assignment
+  // and is not able to evict a less worthy interval; therefore, it can
+  // cause a spill.
   return true;
 }
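Because the vector is now optional, every consumer must collect before it iterates; that is what the added Q.collectInterferingVRegs() in canEvictInterferenceInRange is for (before the patch, the loop apparently relied on earlier queries having populated the list as a side effect). A toy version of the ordering contract, with invented names (ToyQuery, collect, results):

#include <cassert>
#include <optional>
#include <vector>

struct ToyQuery {
  std::optional<std::vector<int>> Cached; // interfering vregs, once collected

  void collect() {
    if (!Cached)
      Cached = std::vector<int>{1, 2, 3}; // pretend these vregs interfere
  }

  const std::vector<int> &results() const {
    assert(Cached && "collect() must run before results() is read");
    return *Cached;
  }
};

int main() {
  ToyQuery Q;
  Q.collect(); // analogous to the Q.collectInterferingVRegs() the patch adds
  for (int VReg : Q.results())
    (void)VReg; // iterate the list, as the eviction-cost loop does
}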
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -557,7 +557,7 @@
 
   bool enableEarlyIfConversion() const override;
 
-  bool enableAdvancedRASplitCost() const override { return true; }
+  bool enableAdvancedRASplitCost() const override { return false; }
 
   std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
 
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -941,7 +941,7 @@
     return TargetSubtargetInfo::ANTIDEP_CRITICAL;
   }
 
-  bool enableAdvancedRASplitCost() const override { return true; }
+  bool enableAdvancedRASplitCost() const override { return false; }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-arm-none-eabi < %s | FileCheck %s
+; RUN: llc -consider-local-interval-cost -mtriple=aarch64-arm-none-eabi < %s | FileCheck %s
 
 @A = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8
 @B = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8
diff --git a/llvm/test/CodeGen/X86/bug26810.ll b/llvm/test/CodeGen/X86/bug26810.ll
--- a/llvm/test/CodeGen/X86/bug26810.ll
+++ b/llvm/test/CodeGen/X86/bug26810.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -regalloc=greedy -stop-after=greedy | FileCheck %s
+; RUN: llc -consider-local-interval-cost < %s -march=x86 -regalloc=greedy -stop-after=greedy | FileCheck %s
 ; Make sure bad eviction sequence doesnt occur
 ; Fix for bugzilla 26810.
 
diff --git a/llvm/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll b/llvm/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll
--- a/llvm/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll
+++ b/llvm/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -regalloc=greedy -stop-after=greedy | FileCheck %s
+; RUN: llc -consider-local-interval-cost < %s -march=x86 -regalloc=greedy -stop-after=greedy | FileCheck %s
 ; Make sure bad eviction sequence doesnt occur
 ; Part of the fix for bugzilla 26810.
 
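The two subtarget hunks turn the advanced split-cost heuristic off by default on AArch64 and X86, and the RUN-line hunks make the tests that exercise it opt back in explicitly through -consider-local-interval-cost. A sketch of the gating as it plausibly reads in RegAllocGreedy.cpp (the option name is confirmed by the RUN lines above; the description string and exact combining expression are paraphrased assumptions, and compiling this fragment requires linking LLVMSupport):

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Assumed declaration, paraphrased from RegAllocGreedy.cpp.
static cl::opt<bool> ConsiderLocalIntervalCost(
    "consider-local-interval-cost", cl::Hidden, cl::init(false),
    cl::desc("Consider the cost of local intervals created by a split "
             "candidate when choosing the best split candidate."));

// In RAGreedy's per-function setup (sketch): either the flag or the subtarget
// hook enables the heuristic, so flipping the hook to false leaves the flag
// as the only way in:
//   EnableAdvancedRASplitCost = ConsiderLocalIntervalCost ||
//                               MF.getSubtarget().enableAdvancedRASplitCost();

int main(int argc, char **argv) {
  // Demo harness: parse the flag as llc would.
  cl::ParseCommandLineOptions(argc, argv);
  return ConsiderLocalIntervalCost ? 0 : 1;
}

The remaining test updates below are the mechanical fallout: with the heuristic off by default, the greedy allocator makes different eviction and split decisions, so the autogenerated register assignments in the CHECK lines shift.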
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -162,9 +162,9 @@
 ; X86-NOBMI-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X86-NOBMI-NEXT:    adcl $0, %edi
 ; X86-NOBMI-NEXT:    movl %ebp, %esi
-; X86-NOBMI-NEXT:    xorl %ebx, %esi
+; X86-NOBMI-NEXT:    xorl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NOBMI-NEXT:    xorl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    xorl %ebx, %edi
 ; X86-NOBMI-NEXT:    orl %esi, %edi
 ; X86-NOBMI-NEXT:    jne .LBB1_2
 ; X86-NOBMI-NEXT:  .LBB1_3: # %for.end
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -390,25 +390,28 @@
 ; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    testl %ecx, %ecx
 ; X32-NEXT:    je .LBB3_1
 ; X32-NEXT:  # %bb.2: # %bb26.preheader
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-NEXT:    xorl %ebx, %ebx
 ; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB3_3: # %bb26
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-NEXT:    movl (%edi,%ebx,8), %ebp
+; X32-NEXT:    movl %ecx, %esi
 ; X32-NEXT:    movl 4(%edi,%ebx,8), %ecx
-; X32-NEXT:    addl (%esi,%ebx,8), %ebp
-; X32-NEXT:    adcl 4(%esi,%ebx,8), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    addl (%edi,%ebx,8), %ebp
+; X32-NEXT:    adcl 4(%edi,%ebx,8), %ecx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    movl %esi, %ecx
 ; X32-NEXT:    incl %ebx
-; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    cmpl %esi, %ebx
 ; X32-NEXT:    jb .LBB3_3
 ; X32-NEXT:    jmp .LBB3_4
 ; X32-NEXT:  .LBB3_1:
diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll
--- a/llvm/test/CodeGen/X86/optimize-max-0.ll
+++ b/llvm/test/CodeGen/X86/optimize-max-0.ll
@@ -450,49 +450,51 @@
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $28, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    movl %ebp, %eax
-; CHECK-NEXT:    imull %ecx, %eax
+; CHECK-NEXT:    movl %ebp, %edx
+; CHECK-NEXT:    imull %eax, %edx
 ; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    je LBB1_19
 ; CHECK-NEXT:  ## %bb.1: ## %bb10.preheader
-; CHECK-NEXT:    shrl $2, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    shrl $2, %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    testl %ebp, %ebp
+; CHECK-NEXT:    movl %eax, %edi
 ; CHECK-NEXT:    je LBB1_12
 ; CHECK-NEXT:  ## %bb.2: ## %bb.nph9
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    je LBB1_12
 ; CHECK-NEXT:  ## %bb.3: ## %bb.nph9.split
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    incl %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB1_6: ## %bb7.preheader
-; CHECK-NEXT:    ## =>This Loop Header: Depth=1
-; CHECK-NEXT:    ## Child Loop BB1_4 Depth 2
+; CHECK-NEXT:    movl %esi, %edx
 ; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB1_4: ## %bb6
-; CHECK-NEXT:    ## Parent Loop BB1_6 Depth=1
-; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movzbl (%eax,%esi,2), %ebx
 ; CHECK-NEXT:    movb %bl, (%edx,%esi)
 ; CHECK-NEXT:    incl %esi
 ; CHECK-NEXT:    cmpl %edi, %esi
 ; CHECK-NEXT:    jb LBB1_4
 ; CHECK-NEXT:  ## %bb.5: ## %bb9
-; CHECK-NEXT:    ## in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    ## in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:    movl %edi, %ebx
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    addl %edi, %edx
 ; CHECK-NEXT:    cmpl %ebp, %ecx
-; CHECK-NEXT:    jne LBB1_6
+; CHECK-NEXT:    je LBB1_12
+; CHECK-NEXT:  ## %bb.6: ## %bb7.preheader
+; CHECK-NEXT:    ## in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    jmp LBB1_4
 ; CHECK-NEXT:  LBB1_12: ## %bb18.loopexit
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -501,10 +503,10 @@
 ; CHECK-NEXT:    cmpl $1, %ebp
 ; CHECK-NEXT:    jbe LBB1_13
 ; CHECK-NEXT:  ## %bb.7: ## %bb.nph5
-; CHECK-NEXT:    cmpl $2, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    cmpl $2, %edi
 ; CHECK-NEXT:    jb LBB1_13
 ; CHECK-NEXT:  ## %bb.8: ## %bb.nph5.split
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl %edi, %ebp
 ; CHECK-NEXT:    shrl %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    shrl %eax
@@ -518,14 +520,14 @@
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
 ; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    xorl %ebx, %ebx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB1_9: ## %bb13
 ; CHECK-NEXT:    ## =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ## Child Loop BB1_10 Depth 2
-; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    andl $1, %ebx
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; CHECK-NEXT:    addl %edx, %ebx
 ; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %ebx
 ; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
@@ -543,26 +545,27 @@
 ; CHECK-NEXT:    jb LBB1_10
 ; CHECK-NEXT:  ## %bb.11: ## %bb17
 ; CHECK-NEXT:    ## in Loop: Header=BB1_9 Depth=1
-; CHECK-NEXT:    incl %edi
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; CHECK-NEXT:    incl %ebx
 ; CHECK-NEXT:    addl %ebp, %ecx
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    movl (%esp), %edx ## 4-byte Reload
 ; CHECK-NEXT:    addl $2, %edx
 ; CHECK-NEXT:    addl %ebp, %eax
-; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; CHECK-NEXT:    jb LBB1_9
 ; CHECK-NEXT:  LBB1_13: ## %bb20
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    cmpl $1, %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    cmpl $1, %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    je LBB1_19
 ; CHECK-NEXT:  ## %bb.14: ## %bb20
-; CHECK-NEXT:    cmpl $3, %edx
+; CHECK-NEXT:    cmpl $3, %esi
 ; CHECK-NEXT:    jne LBB1_24
 ; CHECK-NEXT:  ## %bb.15: ## %bb22
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; CHECK-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:    testl %ebp, %ebp
 ; CHECK-NEXT:    je LBB1_18
 ; CHECK-NEXT:  ## %bb.16: ## %bb.nph
@@ -570,9 +573,11 @@
 ; CHECK-NEXT:    leal 15(%ebp), %eax
 ; CHECK-NEXT:    andl $-16, %eax
 ; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    leal 15(%ecx), %ebx
-; CHECK-NEXT:    andl $-16, %ebx
-; CHECK-NEXT:    addl %eax, %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    addl $15, %edx
+; CHECK-NEXT:    andl $-16, %edx
+; CHECK-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    leal (%edx,%eax), %ebp
@@ -580,14 +585,16 @@
 ; CHECK-NEXT:  LBB1_17: ## %bb23
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subl $4, %esp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %ecx
-; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %ecx, %edi
 ; CHECK-NEXT:    calll _memcpy
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %edi, %ecx
 ; CHECK-NEXT:    addl $16, %esp
-; CHECK-NEXT:    addl %ecx, %ebp
-; CHECK-NEXT:    addl %ebx, %edi
+; CHECK-NEXT:    addl %ebx, %ebp
+; CHECK-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
 ; CHECK-NEXT:    decl %esi
 ; CHECK-NEXT:    jne LBB1_17
 ; CHECK-NEXT:  LBB1_18: ## %bb26
@@ -607,21 +614,24 @@
 ; CHECK-NEXT:    je LBB1_22
 ; CHECK-NEXT:  ## %bb.20: ## %bb.nph11
 ; CHECK-NEXT:    movl %ebp, %esi
-; CHECK-NEXT:    leal 15(%ecx), %ebx
-; CHECK-NEXT:    andl $-16, %ebx
+; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    addl $15, %eax
+; CHECK-NEXT:    andl $-16, %eax
+; CHECK-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB1_21: ## %bb30
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %ecx, %ebx
 ; CHECK-NEXT:    calll _memcpy
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ebx, %ecx
 ; CHECK-NEXT:    addl $16, %esp
-; CHECK-NEXT:    addl %ecx, %ebp
-; CHECK-NEXT:    addl %ebx, %edi
+; CHECK-NEXT:    addl %edi, %ebp
+; CHECK-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
 ; CHECK-NEXT:    decl %esi
 ; CHECK-NEXT:    jne LBB1_21
 ; CHECK-NEXT:  LBB1_22: ## %bb33