Index: llvm/lib/Target/ARM/ARMMacroFusion.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMMacroFusion.cpp
+++ llvm/lib/Target/ARM/ARMMacroFusion.cpp
@@ -45,6 +45,14 @@
   return false;
 }
 
+// Fuse t2LoopDec and t2LoopEnd.
+static bool isLoopDecEndPair(const MachineInstr *FirstMI,
+                             const MachineInstr &SecondMI) {
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+  return (FirstMI == nullptr || FirstMI->getOpcode() == ARM::t2LoopDec) &&
+         SecondMI.getOpcode() == ARM::t2LoopEnd;
+}
+
 /// Check if the instr pair, FirstMI and SecondMI, should be fused
 /// together. Given SecondMI, when FirstMI is unspecified, then check if
 /// SecondMI may be part of a fused pair at all.
@@ -58,11 +66,13 @@
     return true;
   if (ST.hasFuseLiterals() && isLiteralsPair(FirstMI, SecondMI))
     return true;
+  if (ST.hasLOB() && isLoopDecEndPair(FirstMI, SecondMI))
+    return true;
 
   return false;
 }
 
-std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation () {
+std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation() {
   return createMacroFusionDAGMutation(shouldScheduleAdjacent);
 }
 
Index: llvm/lib/Target/ARM/ARMSubtarget.h
===================================================================
--- llvm/lib/Target/ARM/ARMSubtarget.h
+++ llvm/lib/Target/ARM/ARMSubtarget.h
@@ -705,7 +705,7 @@
   bool hasFuseAES() const { return HasFuseAES; }
   bool hasFuseLiterals() const { return HasFuseLiterals; }
   /// Return true if the CPU supports any kind of instruction fusion.
-  bool hasFusion() const { return hasFuseAES() || hasFuseLiterals(); }
+  bool hasFusion() const { return hasFuseAES() || hasFuseLiterals() || hasLOB(); }
 
   bool hasMatMulInt8() const { return HasMatMulInt8; }
 
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/regalloc.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/regalloc.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/regalloc.ll
@@ -58,14 +58,12 @@
 ; CHECK-NEXT:    adds r6, #4
 ; CHECK-NEXT:    cmp lr, r5
 ; CHECK-NEXT:    mov lr, r10
-; CHECK-NEXT:    sub.w lr, lr, #1
-; CHECK-NEXT:    add.w r4, r4, #4
-; CHECK-NEXT:    mov r10, lr
-; CHECK-NEXT:    it lt
+; CHECK-NEXT:    itt lt
 ; CHECK-NEXT:    movlt r12, r5
-; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r7, r6
-; CHECK-NEXT:    cmp.w lr, #0
+; CHECK-NEXT:    adds r4, #4
+; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    mov r10, lr
 ; CHECK-NEXT:    bne .LBB0_5
 ; CHECK-NEXT:    b .LBB0_6
 ; CHECK-NEXT:  .LBB0_6: @ %while.end.loopexit.unr-lcssa.loopexit
@@ -229,17 +227,17 @@
 ; CHECK-NEXT:    sub.w r12, r6, #8
 ; CHECK-NEXT:    and r7, r2, #7
 ; CHECK-NEXT:    add.w r4, r3, r12, lsr #3
-; CHECK-NEXT:    add.w r3, r0, r6, lsl #1
-; CHECK-NEXT:    add.w r12, r1, r6, lsl #1
+; CHECK-NEXT:    add.w r12, r0, r6, lsl #1
+; CHECK-NEXT:    add.w r3, r1, r6, lsl #1
 ; CHECK-NEXT:    mov r5, r4
 ; CHECK-NEXT:  .LBB1_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
 ; CHECK-NEXT:    mov lr, r5
-; CHECK-NEXT:    subs.w lr, lr, #1
 ; CHECK-NEXT:    vabs.f16 q0, q0
-; CHECK-NEXT:    mov r5, lr
 ; CHECK-NEXT:    vstrb.8 q0, [r1], #16
+; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    mov r5, lr
 ; CHECK-NEXT:    bne .LBB1_4
 ; CHECK-NEXT:    b .LBB1_5
 ; CHECK-NEXT:  .LBB1_5: @ %middle.block
@@ -249,18 +247,18 @@
 ; CHECK-NEXT:    b .LBB1_9
 ; CHECK-NEXT:  .LBB1_6:
 ; CHECK-NEXT:    mov lr, r2
-; CHECK-NEXT:    mov r3, r0
-; CHECK-NEXT:    mov r12, r1
+; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:    mov r3, r1
 ; CHECK-NEXT:  .LBB1_7: @ %while.body.preheader18
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_8: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldr.16 s0, [r3]
-; CHECK-NEXT:    add.w r0, r12, #2
-; CHECK-NEXT:    adds r3, #2
+; CHECK-NEXT:    vldr.16 s0, [r12]
+; CHECK-NEXT:    adds r0, r3, #2
+; CHECK-NEXT:    add.w r12, r12, #2
 ; CHECK-NEXT:    vabs.f16 s0, s0
-; CHECK-NEXT:    vstr.16 s0, [r12]
-; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:    vstr.16 s0, [r3]
+; CHECK-NEXT:    mov r3, r0
 ; CHECK-NEXT:    le lr, .LBB1_8
 ; CHECK-NEXT:  .LBB1_9: @ %while.end
 ; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}