Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1017,7 +1017,9 @@ // Find a suitable position to insert the loop start instruction. It needs to // be able to safely define LR. auto FindStartInsertionPoint = [](MachineInstr *Start, - ReachingDefAnalysis &RDA) -> MachineInstr* { + MachineInstr *Dec, + ReachingDefAnalysis &RDA, + InstSet &ToRemove) -> MachineInstr* { // We can define LR because LR already contains the same value. if (Start->getOperand(0).getReg() == ARM::LR) return Start; @@ -1034,23 +1036,31 @@ // Find an insertion point: // - Is there a (mov lr, Count) before Start? If so, and nothing else - // writes to Count before Start, we can insert at that mov. - if (auto *LRDef = RDA.getUniqueReachingMIDef(Start, ARM::LR)) - if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) - return LRDef; + // writes to Count before Start, we can insert at start. + if (auto *LRDef = RDA.getUniqueReachingMIDef(Start, ARM::LR)) { + if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) { + SmallPtrSet Ignore = { Dec }; + TryRemove(LRDef, RDA, ToRemove, Ignore); + return Start; + } + } // - Is there a (mov lr, Count) after Start? If so, and nothing else writes - // to Count after Start, we can insert at that mov. - if (auto *LRDef = RDA.getLocalLiveOutMIDef(MBB, ARM::LR)) - if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) + // to Count after Start, we can insert at that mov (which will now be + // dead). + if (auto *LRDef = RDA.getLocalLiveOutMIDef(MBB, ARM::LR)) { + if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) { + ToRemove.insert(LRDef); return LRDef; + } + } // We've found no suitable LR def and Start doesn't use LR directly. Can we // just define LR anyway? return RDA.isSafeToDefRegAt(Start, ARM::LR) ? Start : nullptr; }; - InsertPt = FindStartInsertionPoint(Start, RDA); + InsertPt = FindStartInsertionPoint(Start, Dec, RDA, ToRemove); Revert = !ValidateRanges(Start, End, BBUtils, ML) || !InsertPt; CannotTailPredicate = !ValidateTailPredicate(InsertPt); @@ -1398,7 +1408,7 @@ // Collect and remove the users of iteration count. SmallPtrSet Killed = { LoLoop.Start, LoLoop.Dec, - LoLoop.End, LoLoop.InsertPt }; + LoLoop.End }; if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed)) LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n"); } @@ -1424,9 +1434,6 @@ if (!IsDo) MIB.add(Start->getOperand(1)); - // If we're inserting at a mov lr, then remove it as it's redundant. - if (InsertPt != Start) - LoLoop.ToRemove.insert(InsertPt); LoLoop.ToRemove.insert(Start); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); return &*MIB; Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-mov.mir @@ -78,12 +78,12 @@ ; CHECK: successors: %bb.5(0x80000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: renamable $s4 = nnan ninf nsz VADDS renamable $s0, renamable $s1, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed $r4 ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: renamable $s4 = nnan ninf nsz VADDS renamable $s2, killed renamable $s4, 14 /* CC::al */, $noreg ; CHECK: renamable $s0 = nnan ninf nsz VADDS killed renamable $s3, killed renamable $s4, 14 /* CC::al */, $noreg, implicit killed $q0 ; CHECK: $s2 = VMOVSR $r1, 14 /* CC::al */, $noreg ; CHECK: renamable $s2 = VUITOS killed renamable $s2, 14 /* CC::al */, $noreg + ; CHECK: $lr = t2DLS killed $r4 ; CHECK: renamable $s4 = nnan ninf nsz VDIVS killed renamable $s0, killed renamable $s2, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: bb.5: Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix-debug.mir @@ -249,7 +249,7 @@ ; CHECK: renamable $r2 = t2LDRs renamable $r9, renamable $r1, 2, 14 /* CC::al */, $noreg, debug-location !41 :: (load 4 from %ir.arrayidx7.us) ; CHECK: $r3 = tMOVr $r5, 14 /* CC::al */, $noreg, debug-location !32 ; CHECK: $r0 = tMOVr $r8, 14 /* CC::al */, $noreg, debug-location !32 - ; CHECK: $lr = t2DLS renamable $r10, debug-location !32 + ; CHECK: $lr = t2DLS renamable $r10, debug-location !42 ; CHECK: bb.3.for.body3.us: ; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r5, $r8, $r9, $r10, $r12 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir @@ -175,8 +175,8 @@ ; CHECK: successors: %bb.3(0x80000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 ; CHECK: $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed $r4 ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0 + ; CHECK: $lr = t2DLS killed $r4 ; CHECK: renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -26,10 +26,10 @@ ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit ; CHECK-NEXT: vmov s4, r1 -; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: vadd.f32 s0, s3, s3 ; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: vcvt.f32.u32 s4, s4 +; CHECK-NEXT: dls lr, r4 ; CHECK-NEXT: vdiv.f32 s0, s0, s4 ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 Index: llvm/test/CodeGen/Thumb2/mve-float32regloops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1435,8 +1435,8 @@ ; CHECK-NEXT: vdup.32 q1, r6 ; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: vmov.f32 s7, s12 ; CHECK-NEXT: vmov.f32 s11, s14 ; CHECK-NEXT: .LBB17_3: @ Parent Loop BB17_2 Depth=1