diff --git a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
--- a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
@@ -114,7 +114,7 @@
     const LiveRange *LR = nullptr;
     LiveRange::const_iterator LRI;  ///< current position in LR
     ConstSegmentIter LiveUnionI;    ///< current position in LiveUnion
-    SmallVector<LiveInterval *, 4> InterferingVRegs;
+    Optional<SmallVector<LiveInterval *, 4>> InterferingVRegs;
    bool CheckedFirstInterference = false;
    bool SeenAllInterferences = false;
    unsigned Tag = 0;
@@ -124,7 +124,7 @@
                const LiveIntervalUnion &NewLiveUnion) {
      LiveUnion = &NewLiveUnion;
      LR = &NewLR;
-      InterferingVRegs.clear();
+      InterferingVRegs = None;
      CheckedFirstInterference = false;
      SeenAllInterferences = false;
      Tag = NewLiveUnion.getTag();
@@ -164,7 +164,7 @@
 
    // Vector generated by collectInterferingVRegs.
    const SmallVectorImpl<LiveInterval *> &interferingVRegs() const {
-      return InterferingVRegs;
+      return *InterferingVRegs;
    }
  };
 
diff --git a/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
--- a/llvm/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -112,7 +112,7 @@
 // Scan the vector of interfering virtual registers in this union. Assume it's
 // quite small.
 bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
-  return is_contained(InterferingVRegs, VirtReg);
+  return is_contained(*InterferingVRegs, VirtReg);
 }
 
 // Collect virtual registers in this union that interfere with this
@@ -126,9 +126,12 @@
 //
 unsigned LiveIntervalUnion::Query::
 collectInterferingVRegs(unsigned MaxInterferingRegs) {
+  if (!InterferingVRegs)
+    InterferingVRegs.emplace();
+
   // Fast path return if we already have the desired information.
-  if (SeenAllInterferences || InterferingVRegs.size() >= MaxInterferingRegs)
-    return InterferingVRegs.size();
+  if (SeenAllInterferences || InterferingVRegs->size() >= MaxInterferingRegs)
+    return InterferingVRegs->size();
 
   // Set up iterators on the first call.
   if (!CheckedFirstInterference) {
@@ -157,14 +160,14 @@
     LiveInterval *VReg = LiveUnionI.value();
     if (VReg != RecentReg && !isSeenInterference(VReg)) {
       RecentReg = VReg;
-      InterferingVRegs.push_back(VReg);
-      if (InterferingVRegs.size() >= MaxInterferingRegs)
-        return InterferingVRegs.size();
+      InterferingVRegs->push_back(VReg);
+      if (InterferingVRegs->size() >= MaxInterferingRegs)
+        return InterferingVRegs->size();
     }
     // This LiveUnion segment is no longer interesting.
     if (!(++LiveUnionI).valid()) {
       SeenAllInterferences = true;
-      return InterferingVRegs.size();
+      return InterferingVRegs->size();
     }
   }
 
@@ -185,7 +188,7 @@
     LiveUnionI.advanceTo(LRI->start);
   }
   SeenAllInterferences = true;
-  return InterferingVRegs.size();
+  return InterferingVRegs->size();
 }
 
 void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc,
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -987,7 +987,7 @@
 
   for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
     LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
-
+    Q.collectInterferingVRegs();
     // Check if any interfering live range is heavier than MaxWeight.
     for (const LiveInterval *Intf : reverse(Q.interferingVRegs())) {
       // Check if interference overlast the segment in interest.
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -26,12 +26,12 @@
 ; CHECK-NEXT:    adrp x11, A
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v14.2d, #0000000000000000
 ; CHECK-NEXT:    add x10, x10, :lo12:B+48
 ; CHECK-NEXT:    add x11, x11, :lo12:A
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // implicit-def: $q15
 ; CHECK-NEXT:    // implicit-def: $q1
-; CHECK-NEXT:    // implicit-def: $q2
+; CHECK-NEXT:    // implicit-def: $q13
 ; CHECK-NEXT:    // implicit-def: $q3
 ; CHECK-NEXT:    // implicit-def: $q4
 ; CHECK-NEXT:    // implicit-def: $q5
@@ -47,9 +47,9 @@
 ; CHECK-NEXT:    // implicit-def: $q23
 ; CHECK-NEXT:    // implicit-def: $q24
 ; CHECK-NEXT:    // implicit-def: $q25
+; CHECK-NEXT:    // implicit-def: $q0
 ; CHECK-NEXT:    // implicit-def: $q26
 ; CHECK-NEXT:    // implicit-def: $q27
-; CHECK-NEXT:    // implicit-def: $q28
 ; CHECK-NEXT:    // implicit-def: $q29
 ; CHECK-NEXT:    // implicit-def: $q30
 ; CHECK-NEXT:    // implicit-def: $q31
@@ -58,11 +58,27 @@
 ; CHECK-NEXT:    // implicit-def: $q10
 ; CHECK-NEXT:    // implicit-def: $q11
 ; CHECK-NEXT:    // implicit-def: $q12
-; CHECK-NEXT:    // implicit-def: $q13
 ; CHECK-NEXT:  .LBB0_1: // %for.cond1.preheader
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x12, xzr
+; CHECK-NEXT:    mov v2.16b, v15.16b
 ; CHECK-NEXT:    ldr q15, [x8]
+; CHECK-NEXT:    mov v28.16b, v12.16b
+; CHECK-NEXT:    mov v12.16b, v10.16b
+; CHECK-NEXT:    mov v10.16b, v8.16b
+; CHECK-NEXT:    mov v8.16b, v30.16b
+; CHECK-NEXT:    mov v30.16b, v27.16b
+; CHECK-NEXT:    mov v27.16b, v25.16b
+; CHECK-NEXT:    mov v25.16b, v23.16b
+; CHECK-NEXT:    mov v23.16b, v21.16b
+; CHECK-NEXT:    mov v21.16b, v19.16b
+; CHECK-NEXT:    mov v19.16b, v17.16b
+; CHECK-NEXT:    mov v17.16b, v7.16b
+; CHECK-NEXT:    mov v7.16b, v5.16b
+; CHECK-NEXT:    mov v5.16b, v3.16b
+; CHECK-NEXT:    mov v3.16b, v1.16b
+; CHECK-NEXT:    mov v1.16b, v14.16b
 ; CHECK-NEXT:    ldr q14, [x12]
 ; CHECK-NEXT:    ldr q0, [x10], #64
 ; CHECK-NEXT:    ldr x18, [x12]
@@ -78,99 +94,115 @@
 ; CHECK-NEXT:    ldr x0, [x8]
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    mul x1, x13, x18
-; CHECK-NEXT:    add v12.2d, v12.2d, v0.2d
+; CHECK-NEXT:    add v11.2d, v11.2d, v0.2d
 ; CHECK-NEXT:    fmov d0, x1
 ; CHECK-NEXT:    mul x1, x12, x18
+; CHECK-NEXT:    mov v14.16b, v1.16b
+; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    mov v3.16b, v5.16b
+; CHECK-NEXT:    mov v5.16b, v7.16b
+; CHECK-NEXT:    mov v7.16b, v17.16b
+; CHECK-NEXT:    mov v17.16b, v19.16b
+; CHECK-NEXT:    mov v19.16b, v21.16b
+; CHECK-NEXT:    mov v21.16b, v23.16b
+; CHECK-NEXT:    mov v23.16b, v25.16b
+; CHECK-NEXT:    mov v25.16b, v27.16b
+; CHECK-NEXT:    mov v27.16b, v30.16b
+; CHECK-NEXT:    mov v30.16b, v8.16b
+; CHECK-NEXT:    mov v8.16b, v10.16b
+; CHECK-NEXT:    mov v10.16b, v12.16b
+; CHECK-NEXT:    mov v12.16b, v28.16b
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    mul x1, x17, x18
-; CHECK-NEXT:    add v13.2d, v13.2d, v0.2d
-; CHECK-NEXT:    add v11.2d, v11.2d, v0.2d
+; CHECK-NEXT:    add v12.2d, v28.2d, v0.2d
+; CHECK-NEXT:    add v10.2d, v10.2d, v0.2d
 ; CHECK-NEXT:    fmov d0, x1
 ; CHECK-NEXT:    mul x18, x16, x18
-; CHECK-NEXT:    ldr q14, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], x18
 ; CHECK-NEXT:    mul x18, x15, x0
 ; CHECK-NEXT:    add x1, x11, x8
-; CHECK-NEXT:    add v10.2d, v10.2d, v0.2d
+; CHECK-NEXT:    add v9.2d, v9.2d, v0.2d
 ; CHECK-NEXT:    fmov d0, x18
 ; CHECK-NEXT:    mul x18, x14, x0
 ; CHECK-NEXT:    ldr x1, [x1, #128]
 ; CHECK-NEXT:    mov v0.d[1], x18
 ; CHECK-NEXT:    mul x18, x13, x0
-; CHECK-NEXT:    add v8.2d, v8.2d, v0.2d
-; CHECK-NEXT:    add v25.2d, v25.2d, v0.2d
-; CHECK-NEXT:    add v22.2d, v22.2d, v0.2d
-; CHECK-NEXT:    add v18.2d, v18.2d, v0.2d
-; CHECK-NEXT:    add v6.2d, v6.2d, v0.2d
+; CHECK-NEXT:    add v31.2d, v31.2d, v0.2d
+; CHECK-NEXT:    add v24.2d, v24.2d, v0.2d
+; CHECK-NEXT:    add v21.2d, v21.2d, v0.2d
+; CHECK-NEXT:    add v17.2d, v17.2d, v0.2d
+; CHECK-NEXT:    add v5.2d, v5.2d, v0.2d
 ; CHECK-NEXT:    add v14.2d, v14.2d, v0.2d
 ; CHECK-NEXT:    fmov d0, x18
 ; CHECK-NEXT:    mul x18, x12, x0
 ; CHECK-NEXT:    mov v0.d[1], x18
 ; CHECK-NEXT:    mul x18, x17, x0
 ; CHECK-NEXT:    mul x0, x16, x0
-; CHECK-NEXT:    add v9.2d, v9.2d, v0.2d
-; CHECK-NEXT:    add v31.2d, v31.2d, v0.2d
-; CHECK-NEXT:    add v26.2d, v26.2d, v0.2d
-; CHECK-NEXT:    add v23.2d, v23.2d, v0.2d
-; CHECK-NEXT:    add v21.2d, v21.2d, v0.2d
-; CHECK-NEXT:    add v19.2d, v19.2d, v0.2d
-; CHECK-NEXT:    add v17.2d, v17.2d, v0.2d
-; CHECK-NEXT:    add v7.2d, v7.2d, v0.2d
-; CHECK-NEXT:    add v5.2d, v5.2d, v0.2d
-; CHECK-NEXT:    add v3.2d, v3.2d, v0.2d
-; CHECK-NEXT:    add v2.2d, v2.2d, v0.2d
-; CHECK-NEXT:    fmov d0, x18
-; CHECK-NEXT:    mul x15, x15, x1
-; CHECK-NEXT:    mov v0.d[1], x0
-; CHECK-NEXT:    mul x14, x14, x1
+; CHECK-NEXT:    add v8.2d, v8.2d, v0.2d
 ; CHECK-NEXT:    add v30.2d, v30.2d, v0.2d
-; CHECK-NEXT:    add v24.2d, v24.2d, v0.2d
+; CHECK-NEXT:    add v25.2d, v25.2d, v0.2d
+; CHECK-NEXT:    add v22.2d, v22.2d, v0.2d
 ; CHECK-NEXT:    add v20.2d, v20.2d, v0.2d
+; CHECK-NEXT:    add v18.2d, v18.2d, v0.2d
 ; CHECK-NEXT:    add v16.2d, v16.2d, v0.2d
+; CHECK-NEXT:    add v6.2d, v6.2d, v0.2d
 ; CHECK-NEXT:    add v4.2d, v4.2d, v0.2d
+; CHECK-NEXT:    add v13.2d, v13.2d, v0.2d
 ; CHECK-NEXT:    add v1.2d, v1.2d, v0.2d
+; CHECK-NEXT:    fmov d0, x18
+; CHECK-NEXT:    mov v15.16b, v2.16b
+; CHECK-NEXT:    mul x15, x15, x1
+; CHECK-NEXT:    mov v0.d[1], x0
+; CHECK-NEXT:    mul x14, x14, x1
+; CHECK-NEXT:    add v29.2d, v29.2d, v0.2d
+; CHECK-NEXT:    add v23.2d, v23.2d, v0.2d
+; CHECK-NEXT:    add v19.2d, v19.2d, v0.2d
+; CHECK-NEXT:    add v7.2d, v7.2d, v0.2d
+; CHECK-NEXT:    add v3.2d, v3.2d, v0.2d
+; CHECK-NEXT:    add v15.2d, v2.2d, v0.2d
 ; CHECK-NEXT:    fmov d0, x15
 ; CHECK-NEXT:    mul x13, x13, x1
 ; CHECK-NEXT:    mov v0.d[1], x14
 ; CHECK-NEXT:    mul x12, x12, x1
-; CHECK-NEXT:    add v29.2d, v29.2d, v0.2d
+; CHECK-NEXT:    add v27.2d, v27.2d, v0.2d
 ; CHECK-NEXT:    fmov d0, x13
+; CHECK-NEXT:    ldr q28, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mul x17, x17, x1
 ; CHECK-NEXT:    mov v0.d[1], x12
 ; CHECK-NEXT:    mul x16, x16, x1
-; CHECK-NEXT:    add v28.2d, v28.2d, v0.2d
+; CHECK-NEXT:    add v26.2d, v26.2d, v0.2d
 ; CHECK-NEXT:    fmov d0, x17
 ; CHECK-NEXT:    mov v0.d[1], x16
+; CHECK-NEXT:    add v28.2d, v28.2d, v0.2d
+; CHECK-NEXT:    str q28, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    add x8, x8, #8 // =8
-; CHECK-NEXT:    add v27.2d, v27.2d, v0.2d
 ; CHECK-NEXT:    cmp x8, #64 // =64
 ; CHECK-NEXT:    add x9, x9, #1 // =1
-; CHECK-NEXT:    str q14, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
 ; CHECK-NEXT:    adrp x8, C
 ; CHECK-NEXT:    add x8, x8, :lo12:C
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    stp q13, q12, [x8]
-; CHECK-NEXT:    stp q11, q10, [x8, #32]
-; CHECK-NEXT:    stp q9, q8, [x8, #64]
+; CHECK-NEXT:    stp q12, q11, [x8]
+; CHECK-NEXT:    stp q10, q9, [x8, #32]
+; CHECK-NEXT:    stp q8, q31, [x8, #64]
+; CHECK-NEXT:    stp q3, q13, [x8, #432]
+; CHECK-NEXT:    stp q14, q1, [x8, #464]
+; CHECK-NEXT:    str q15, [x8, #496]
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    stp q31, q30, [x8, #96]
-; CHECK-NEXT:    stp q29, q28, [x8, #144]
-; CHECK-NEXT:    stp q27, q26, [x8, #176]
-; CHECK-NEXT:    str q25, [x8, #208]
-; CHECK-NEXT:    stp q24, q23, [x8, #240]
-; CHECK-NEXT:    stp q22, q21, [x8, #272]
-; CHECK-NEXT:    stp q20, q19, [x8, #304]
-; CHECK-NEXT:    stp q18, q17, [x8, #336]
-; CHECK-NEXT:    stp q16, q7, [x8, #368]
-; CHECK-NEXT:    stp q6, q5, [x8, #400]
-; CHECK-NEXT:    stp q4, q3, [x8, #432]
-; CHECK-NEXT:    stp q0, q2, [x8, #464]
-; CHECK-NEXT:    str q1, [x8, #496]
+; CHECK-NEXT:    stp q30, q29, [x8, #96]
+; CHECK-NEXT:    stp q27, q26, [x8, #144]
+; CHECK-NEXT:    stp q0, q25, [x8, #176]
+; CHECK-NEXT:    str q24, [x8, #208]
+; CHECK-NEXT:    stp q23, q22, [x8, #240]
+; CHECK-NEXT:    stp q21, q20, [x8, #272]
+; CHECK-NEXT:    stp q19, q18, [x8, #304]
+; CHECK-NEXT:    stp q17, q16, [x8, #336]
+; CHECK-NEXT:    stp q7, q6, [x8, #368]
+; CHECK-NEXT:    stp q5, q4, [x8, #400]
 ; CHECK-NEXT:    add sp, sp, #80 // =80
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/X86/bug26810.ll b/llvm/test/CodeGen/X86/bug26810.ll
--- a/llvm/test/CodeGen/X86/bug26810.ll
+++ b/llvm/test/CodeGen/X86/bug26810.ll
@@ -26,6 +26,7 @@
 ; CHECK-NEXT: MOVAPSrm
 ; CHECK-NEXT: ADDPDrr
 ; CHECK-NEXT: MOVAPSmr
+; CHECK-NEXT: MOVAPSrm
 ; CHECK-NEXT: ADD32ri8
 
 target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -390,25 +390,26 @@
 ; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    testl %esi, %esi
 ; X32-NEXT:    je .LBB3_1
 ; X32-NEXT:  # %bb.2: # %bb26.preheader
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-NEXT:    xorl %ebx, %ebx
 ; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB3_3: # %bb26
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl (%edi,%ebx,8), %ebp
-; X32-NEXT:    movl 4(%edi,%ebx,8), %ecx
-; X32-NEXT:    addl (%esi,%ebx,8), %ebp
-; X32-NEXT:    adcl 4(%esi,%ebx,8), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl (%ecx,%ebx,8), %ebp
+; X32-NEXT:    movl 4(%ecx,%ebx,8), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    addl (%edi,%ebx,8), %ebp
+; X32-NEXT:    adcl 4(%edi,%ebx,8), %ecx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    incl %ebx
-; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    cmpl %esi, %ebx
 ; X32-NEXT:    jb .LBB3_3
 ; X32-NEXT:    jmp .LBB3_4
 ; X32-NEXT:  .LBB3_1:
diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll
--- a/llvm/test/CodeGen/X86/optimize-max-0.ll
+++ b/llvm/test/CodeGen/X86/optimize-max-0.ll
@@ -450,49 +450,51 @@
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $28, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    movl %ebp, %eax
-; CHECK-NEXT:    imull %ecx, %eax
+; CHECK-NEXT:    movl %ebp, %edx
+; CHECK-NEXT:    imull %eax, %edx
 ; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    je LBB1_19
 ; CHECK-NEXT:  ## %bb.1: ## %bb10.preheader
-; CHECK-NEXT:    shrl $2, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    shrl $2, %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    testl %ebp, %ebp
+; CHECK-NEXT:    movl %eax, %edi
 ; CHECK-NEXT:    je LBB1_12
 ; CHECK-NEXT:  ## %bb.2: ## %bb.nph9
-; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    je LBB1_12
 ; CHECK-NEXT:  ## %bb.3: ## %bb.nph9.split
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    incl %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB1_6: ## %bb7.preheader
-; CHECK-NEXT:    ## =>This Loop Header: Depth=1
-; CHECK-NEXT:    ## Child Loop BB1_4 Depth 2
+; CHECK-NEXT:    movl %esi, %edx
 ; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB1_4: ## %bb6
-; CHECK-NEXT:    ## Parent Loop BB1_6 Depth=1
-; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movzbl (%eax,%esi,2), %ebx
 ; CHECK-NEXT:    movb %bl, (%edx,%esi)
 ; CHECK-NEXT:    incl %esi
 ; CHECK-NEXT:    cmpl %edi, %esi
 ; CHECK-NEXT:    jb LBB1_4
 ; CHECK-NEXT:  ## %bb.5: ## %bb9
-; CHECK-NEXT:    ## in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    ## in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:    movl %edi, %ebx
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    addl %edi, %edx
 ; CHECK-NEXT:    cmpl %ebp, %ecx
-; CHECK-NEXT:    jne LBB1_6
+; CHECK-NEXT:    je LBB1_12
+; CHECK-NEXT:  ## %bb.6: ## %bb7.preheader
+; CHECK-NEXT:    ## in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    jmp LBB1_4
 ; CHECK-NEXT:  LBB1_12: ## %bb18.loopexit
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -501,10 +503,10 @@
 ; CHECK-NEXT:    cmpl $1, %ebp
 ; CHECK-NEXT:    jbe LBB1_13
 ; CHECK-NEXT:  ## %bb.7: ## %bb.nph5
-; CHECK-NEXT:    cmpl $2, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    cmpl $2, %edi
 ; CHECK-NEXT:    jb LBB1_13
 ; CHECK-NEXT:  ## %bb.8: ## %bb.nph5.split
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl %edi, %ebp
 ; CHECK-NEXT:    shrl %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    shrl %eax
@@ -518,14 +520,14 @@
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
 ; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    xorl %ebx, %ebx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB1_9: ## %bb13
 ; CHECK-NEXT:    ## =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ## Child Loop BB1_10 Depth 2
-; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    andl $1, %ebx
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; CHECK-NEXT:    addl %edx, %ebx
 ; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %ebx
 ; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
@@ -543,26 +545,27 @@
 ; CHECK-NEXT:    jb LBB1_10
 ; CHECK-NEXT:  ## %bb.11: ## %bb17
 ; CHECK-NEXT:    ## in Loop: Header=BB1_9 Depth=1
-; CHECK-NEXT:    incl %edi
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; CHECK-NEXT:    incl %ebx
 ; CHECK-NEXT:    addl %ebp, %ecx
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    movl (%esp), %edx ## 4-byte Reload
 ; CHECK-NEXT:    addl $2, %edx
 ; CHECK-NEXT:    addl %ebp, %eax
-; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; CHECK-NEXT:    jb LBB1_9
 ; CHECK-NEXT:  LBB1_13: ## %bb20
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    cmpl $1, %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    cmpl $1, %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    je LBB1_19
 ; CHECK-NEXT:  ## %bb.14: ## %bb20
-; CHECK-NEXT:    cmpl $3, %edx
+; CHECK-NEXT:    cmpl $3, %esi
 ; CHECK-NEXT:    jne LBB1_24
 ; CHECK-NEXT:  ## %bb.15: ## %bb22
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; CHECK-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:    testl %ebp, %ebp
 ; CHECK-NEXT:    je LBB1_18
 ; CHECK-NEXT:  ## %bb.16: ## %bb.nph
@@ -570,9 +573,11 @@
 ; CHECK-NEXT:    leal 15(%ebp), %eax
 ; CHECK-NEXT:    andl $-16, %eax
 ; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    leal 15(%ecx), %ebx
-; CHECK-NEXT:    andl $-16, %ebx
-; CHECK-NEXT:    addl %eax, %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    addl $15, %edx
+; CHECK-NEXT:    andl $-16, %edx
+; CHECK-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    leal (%edx,%eax), %ebp
@@ -580,14 +585,16 @@
 ; CHECK-NEXT:  LBB1_17: ## %bb23
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subl $4, %esp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %ecx
-; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %ecx, %edi
 ; CHECK-NEXT:    calll _memcpy
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %edi, %ecx
 ; CHECK-NEXT:    addl $16, %esp
-; CHECK-NEXT:    addl %ecx, %ebp
-; CHECK-NEXT:    addl %ebx, %edi
+; CHECK-NEXT:    addl %ebx, %ebp
+; CHECK-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
 ; CHECK-NEXT:    decl %esi
 ; CHECK-NEXT:    jne LBB1_17
 ; CHECK-NEXT:  LBB1_18: ## %bb26
@@ -607,21 +614,24 @@
 ; CHECK-NEXT:    je LBB1_22
 ; CHECK-NEXT:  ## %bb.20: ## %bb.nph11
 ; CHECK-NEXT:    movl %ebp, %esi
-; CHECK-NEXT:    leal 15(%ecx), %ebx
-; CHECK-NEXT:    andl $-16, %ebx
+; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    addl $15, %eax
+; CHECK-NEXT:    andl $-16, %eax
+; CHECK-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB1_21: ## %bb30
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %ecx, %ebx
 ; CHECK-NEXT:    calll _memcpy
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ebx, %ecx
 ; CHECK-NEXT:    addl $16, %esp
-; CHECK-NEXT:    addl %ecx, %ebp
-; CHECK-NEXT:    addl %ebx, %edi
+; CHECK-NEXT:    addl %edi, %ebp
+; CHECK-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
 ; CHECK-NEXT:    decl %esi
 ; CHECK-NEXT:    jne LBB1_21
 ; CHECK-NEXT:  LBB1_22: ## %bb33