diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1238,19 +1238,37 @@ /// Searches for a increment or decrement of \p Reg after \p MBBI. static MachineBasicBlock::iterator findIncDecAfter(MachineBasicBlock::iterator MBBI, Register Reg, - ARMCC::CondCodes Pred, Register PredReg, int &Offset) { + ARMCC::CondCodes Pred, Register PredReg, int &Offset, + const TargetRegisterInfo *TRI) { Offset = 0; MachineBasicBlock &MBB = *MBBI->getParent(); MachineBasicBlock::iterator EndMBBI = MBB.end(); MachineBasicBlock::iterator NextMBBI = std::next(MBBI); - // Skip debug values. - while (NextMBBI != EndMBBI && NextMBBI->isDebugInstr()) - ++NextMBBI; - if (NextMBBI == EndMBBI) - return EndMBBI; + while (NextMBBI != EndMBBI) { + // Skip debug values. + while (NextMBBI != EndMBBI && NextMBBI->isDebugInstr()) + ++NextMBBI; + if (NextMBBI == EndMBBI) + return EndMBBI; + + int Off = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg); + if (Off) { + Offset = Off; + return NextMBBI; + } - Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg); - return Offset == 0 ? EndMBBI : NextMBBI; + // SP can only be combined if it is the next instruction after the original + // MBBI, otherwise we may be incrementing the stack pointer (invalidating + // anything below the new pointer) when its frame elements are still in + // use. Other registers can attempt to look further, until a different use + // or def of the register is found. 
+ if (Reg == ARM::SP || NextMBBI->readsRegister(Reg, TRI) || + NextMBBI->definesRegister(Reg, TRI)) + return EndMBBI; + + ++NextMBBI; + } + return EndMBBI; } /// Fold proceeding/trailing inc/dec of base register into the @@ -1296,7 +1314,7 @@ } else if (Mode == ARM_AM::ib && Offset == -Bytes) { Mode = ARM_AM::da; } else { - MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI); if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) && ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) { @@ -1483,7 +1501,7 @@ } else if (Offset == -Bytes) { NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); } else { - MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI); if (Offset == Bytes) { NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add); } else if (!isAM5 && Offset == -Bytes) { @@ -1614,7 +1632,7 @@ if (Offset == 8 || Offset == -8) { NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE; } else { - MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset, TRI); if (Offset == 8 || Offset == -8) { NewOpc = Opcode == ARM::t2LDRDi8 ? 
ARM::t2LDRD_POST : ARM::t2STRD_POST; } else diff --git a/llvm/test/CodeGen/ARM/indexed-mem.ll b/llvm/test/CodeGen/ARM/indexed-mem.ll --- a/llvm/test/CodeGen/ARM/indexed-mem.ll +++ b/llvm/test/CodeGen/ARM/indexed-mem.ll @@ -220,16 +220,14 @@ define i32* @post_inc_ldrd(i32* %base, i32* %addr.3) { ; CHECK-V8M-LABEL: post_inc_ldrd: ; CHECK-V8M: @ %bb.0: -; CHECK-V8M-NEXT: ldrd r2, r3, [r0] -; CHECK-V8M-NEXT: adds r0, #8 +; CHECK-V8M-NEXT: ldrd r2, r3, [r0], #8 ; CHECK-V8M-NEXT: add r2, r3 ; CHECK-V8M-NEXT: str r2, [r1] ; CHECK-V8M-NEXT: bx lr ; ; CHECK-V8A-LABEL: post_inc_ldrd: ; CHECK-V8A: @ %bb.0: -; CHECK-V8A-NEXT: ldm r0, {r2, r3} -; CHECK-V8A-NEXT: add r0, r0, #8 +; CHECK-V8A-NEXT: ldm r0!, {r2, r3} ; CHECK-V8A-NEXT: add r2, r2, r3 ; CHECK-V8A-NEXT: str r2, [r1] ; CHECK-V8A-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -82,13 +82,10 @@ ; CHECK-NEXT: add.w r0, r0, r3, lsl #2 ; CHECK-NEXT: .LBB0_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s0, [r1] -; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vldr s2, [r2] -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vldmia r1!, {s0} +; CHECK-NEXT: vldmia r2!, {s2} ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstmia r0!, {s0} ; CHECK-NEXT: le lr, .LBB0_10 ; CHECK-NEXT: .LBB0_11: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -43,14 +43,11 @@ ; CHECK-NEXT: add.w r7, r2, r12, lsl #2 ; CHECK-NEXT: .LBB0_6: @ %for.body.prol ; 
CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s0, [r6] -; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: vldr s2, [r5] -; CHECK-NEXT: adds r5, #4 +; CHECK-NEXT: vldmia r6!, {s0} ; CHECK-NEXT: add.w r12, r12, #1 +; CHECK-NEXT: vldmia r5!, {s2} ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r7] -; CHECK-NEXT: adds r7, #4 +; CHECK-NEXT: vstmia r7!, {s0} ; CHECK-NEXT: le lr, .LBB0_6 ; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit ; CHECK-NEXT: cmp r4, #3 @@ -261,14 +258,11 @@ ; CHECK-NEXT: add.w r7, r2, r12, lsl #2 ; CHECK-NEXT: .LBB1_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s0, [r6] -; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: vldr s2, [r5] -; CHECK-NEXT: adds r5, #4 +; CHECK-NEXT: vldmia r6!, {s0} ; CHECK-NEXT: add.w r12, r12, #1 +; CHECK-NEXT: vldmia r5!, {s2} ; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r7] -; CHECK-NEXT: adds r7, #4 +; CHECK-NEXT: vstmia r7!, {s0} ; CHECK-NEXT: le lr, .LBB1_6 ; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit ; CHECK-NEXT: cmp r4, #3 @@ -479,14 +473,11 @@ ; CHECK-NEXT: add.w r7, r2, r12, lsl #2 ; CHECK-NEXT: .LBB2_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s0, [r6] -; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: vldr s2, [r5] -; CHECK-NEXT: adds r5, #4 +; CHECK-NEXT: vldmia r6!, {s0} ; CHECK-NEXT: add.w r12, r12, #1 +; CHECK-NEXT: vldmia r5!, {s2} ; CHECK-NEXT: vsub.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r7] -; CHECK-NEXT: adds r7, #4 +; CHECK-NEXT: vstmia r7!, {s0} ; CHECK-NEXT: le lr, .LBB2_6 ; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit ; CHECK-NEXT: cmp r4, #3 @@ -706,13 +697,11 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r4, [r6], #4 ; CHECK-NEXT: add.w r12, r12, #1 -; CHECK-NEXT: vldr s2, [r5] -; CHECK-NEXT: adds r5, #4 +; CHECK-NEXT: vldmia r5!, {s2} ; CHECK-NEXT: vmov s0, r4 ; CHECK-NEXT: vcvt.f32.s32 s0, s0 ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, 
[r7] -; CHECK-NEXT: adds r7, #4 +; CHECK-NEXT: vstmia r7!, {s0} ; CHECK-NEXT: le lr, .LBB3_9 ; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit ; CHECK-NEXT: cmp.w r8, #3 @@ -1025,8 +1014,7 @@ ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: vmul.f16 s0, s2, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vstr s0, [r2] -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vstmia r2!, {s0} ; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} @@ -1140,8 +1128,7 @@ ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: vadd.f16 s0, s2, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vstr s0, [r2] -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vstmia r2!, {s0} ; CHECK-NEXT: le lr, .LBB6_7 ; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} @@ -1255,8 +1242,7 @@ ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: vsub.f16 s0, s2, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vstr s0, [r2] -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vstmia r2!, {s0} ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} @@ -1376,8 +1362,7 @@ ; CHECK-NEXT: vcvt.f16.s32 s2, s2 ; CHECK-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vstr s0, [r2] -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vstmia r2!, {s0} ; CHECK-NEXT: le lr, .LBB8_7 ; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1442,8 +1442,7 @@ ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: vfma.f32 q5, q4, r5 ; CHECK-NEXT: vfma.f32 q3, q5, q2 -; CHECK-NEXT: vstmia r7, {s20, s21} -; CHECK-NEXT: adds r7, #8 +; CHECK-NEXT: vstmia r7!, {s20, s21} ; CHECK-NEXT: vfma.f32 q3, q4, q1 ; CHECK-NEXT: vstrw.32 q3, [r4] 
; CHECK-NEXT: le lr, .LBB17_3 @@ -2069,8 +2068,7 @@ ; CHECK-NEXT: .LBB20_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB20_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrd r7, r4, [r1] -; CHECK-NEXT: adds r1, #8 +; CHECK-NEXT: ldrd r7, r4, [r1], #8 ; CHECK-NEXT: vfma.f32 q6, q3, r7 ; CHECK-NEXT: vmov r7, s24 ; CHECK-NEXT: vmov q1, q6 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll @@ -309,14 +309,11 @@ ; CHECK-NEXT: add.w r2, r2, r12, lsl #2 ; CHECK-NEXT: .LBB2_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s0, [r0] -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vldr s2, [r1] -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vldmia r0!, {s0} +; CHECK-NEXT: vldmia r1!, {s2} ; CHECK-NEXT: vldr s4, [r2] ; CHECK-NEXT: vfma.f32 s4, s2, s0 -; CHECK-NEXT: vstr s4, [r2] -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vstmia r2!, {s4} ; CHECK-NEXT: le lr, .LBB2_7 ; CHECK-NEXT: .LBB2_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -44,14 +44,11 @@ ; CHECK-NEXT: add.w r2, r2, r12, lsl #2 ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s0, [r0] -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vldr s2, [r1] -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vldmia r0!, {s0} +; CHECK-NEXT: vldmia r1!, {s2} ; CHECK-NEXT: vldr s4, [r2] ; CHECK-NEXT: vfma.f32 s4, s2, s0 -; CHECK-NEXT: vstr s4, [r2] -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vstmia r2!, {s4} ; CHECK-NEXT: le lr, .LBB0_7 ; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} diff --git 
a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -38,12 +38,10 @@ ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r5, r4, [r0] +; CHECK-NEXT: ldrd r5, r4, [r0], #8 ; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: ldrd r8, r7, [r1] -; CHECK-NEXT: adds r0, #8 +; CHECK-NEXT: ldrd r8, r7, [r1], #8 ; CHECK-NEXT: smull r4, r7, r7, r4 -; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: smull r6, r5, r8, r5 ; CHECK-NEXT: rsbs.w r9, r4, #-2147483648 @@ -95,8 +93,7 @@ ; CHECK-NEXT: vorr q2, q2, q4 ; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: strd r4, r3, [r2] -; CHECK-NEXT: adds r2, #8 +; CHECK-NEXT: strd r4, r3, [r2], #8 ; CHECK-NEXT: le lr, .LBB0_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload @@ -744,10 +741,8 @@ ; CHECK-NEXT: add.w r12, r0, r5, lsl #2 ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r7, [r0] -; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: ldrd r5, r10, [r1] -; CHECK-NEXT: adds r1, #8 +; CHECK-NEXT: ldrd r4, r7, [r0], #8 +; CHECK-NEXT: ldrd r5, r10, [r1], #8 ; CHECK-NEXT: umull r4, r5, r5, r4 ; CHECK-NEXT: lsrl r4, r5, #31 ; CHECK-NEXT: subs.w r6, r4, #-1 @@ -773,8 +768,7 @@ ; CHECK-NEXT: vorn q0, q1, q0 ; CHECK-NEXT: vmov r4, s2 ; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: strd r5, r4, [r2] -; CHECK-NEXT: adds r2, #8 +; CHECK-NEXT: strd r5, r4, [r2], #8 ; CHECK-NEXT: le lr, .LBB3_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -521,8 +521,7 @@ ; CHECK-NEXT: add.w r0, r0, r2, lsl #2 ; CHECK-NEXT: .LBB5_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldmia r0!, {s2} ; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: le lr, .LBB5_8 ; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup @@ -620,8 +619,7 @@ ; CHECK-NEXT: add.w r0, r0, r2, lsl #2 ; CHECK-NEXT: .LBB6_8: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vldmia r0!, {s2} ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: le lr, .LBB6_8 ; CHECK-NEXT: .LBB6_9: @ %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll @@ -176,8 +176,7 @@ ; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vmul.f32 s0, s0, s0 ; CHECK-NEXT: vfma.f32 s0, s2, s2 -; CHECK-NEXT: vstr s0, [r12] -; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: vstmia r12!, {s0} ; CHECK-NEXT: le lr, .LBB1_7 ; CHECK-NEXT: .LBB1_8: @ %while.end ; CHECK-NEXT: pop {r4, r5, r7, pc}