diff --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp --- a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp +++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp @@ -38,6 +38,8 @@ bool runOnMachineFunction(MachineFunction &MF) override; void moveBasicBlock(MachineBasicBlock *BB, MachineBasicBlock *After); bool blockIsBefore(MachineBasicBlock *BB, MachineBasicBlock *Other); + void fixWLS(MachineLoop *ML, bool &Changed); + void processPostOrderLoops(MachineLoop *ML, bool &Changed); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -57,9 +59,143 @@ INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false, false) +static bool containsWLS(MachineBasicBlock *MBB) { + for (auto &Terminator : MBB->terminators()) { + if (Terminator.getOpcode() == ARM::t2WhileLoopStartLR) + return true; + } + return false; +} + +/// Tries to find t2WhileLoopStartLR in the loop predecessor or, failing that, +/// in its only predecessor. Returns the containing block, or nullptr if none. +static MachineBasicBlock *findWLS(MachineLoop *ML) { + MachineBasicBlock *Preheader = ML->getLoopPredecessor(); + if (!Preheader) + return nullptr; + if (containsWLS(Preheader)) + return Preheader; + if (Preheader->pred_size() == 1 && containsWLS(*(Preheader->pred_begin()))) + return *(Preheader->pred_begin()); + return nullptr; +} + +/// Check if loop has a backwards branching WLS, and if possible, fix it. +/// This requires checking the preheader (or its predecessor) for a WLS and if +/// its target is before it. +/// If moving the target block wouldn't produce another backwards WLS or a new +/// forwards LE branch, then move the target block after the preheader (or its +/// predecessor). 
+void ARMBlockPlacement::fixWLS(MachineLoop *ML, bool &Changed) { + MachineBasicBlock *Preheader = findWLS(ML); + if (!Preheader) + return; + + for (auto &Terminator : Preheader->terminators()) { + if (Terminator.getOpcode() != ARM::t2WhileLoopStartLR) + continue; + MachineBasicBlock *LoopExit = Terminator.getOperand(2).getMBB(); + // We don't want to move the function's entry block. + if (!LoopExit->getPrevNode()) + continue; + if (blockIsBefore(Preheader, LoopExit)) + continue; + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from " + << Preheader->getFullName() << " to " + << LoopExit->getFullName() << "\n"); + + // Make sure that moving the target block doesn't cause any of its WLSs + // that were previously not backwards to become backwards + bool CanMove = true; + for (auto &LoopExitTerminator : LoopExit->terminators()) { + if (LoopExitTerminator.getOpcode() != ARM::t2WhileLoopStartLR) + continue; + // An example loop structure where the LoopExit can't be moved, since + // bb1's WLS will become backwards once it's moved after bb3 + // bb1 (LoopExit): + // WLS bb2 - LoopExit2 + // bb2: + // ... + // bb3: - Preheader + // WLS bb1 + // bb4: - Header + MachineBasicBlock *LoopExit2 = LoopExitTerminator.getOperand(2).getMBB(); + // If the WLS from LoopExit to LoopExit2 is already backwards then + // moving LoopExit won't affect it, so it can be moved. If LoopExit2 is + // after the Preheader then moving will keep it as a forward branch, so it + // can be moved. If LoopExit2 is between the Preheader and LoopExit then + // moving LoopExit will make it a backwards branch, so it can't be moved + // since we'd fix one and introduce one backwards branch. + // TODO: Analyse the blocks to make a decision if it would be worth + // moving LoopExit even if LoopExit2 is between the Preheader and + // LoopExit. 
+ if (!blockIsBefore(LoopExit2, LoopExit) && + (LoopExit2 == Preheader || blockIsBefore(LoopExit2, Preheader))) { + LLVM_DEBUG(dbgs() << DEBUG_PREFIX + << "Can't move the target block as it would " + "introduce a new backwards WLS branch\n"); + CanMove = false; + break; + } + } + + if (CanMove) { + // Make sure no LEs become forwards. + // An example loop structure where the LoopExit can't be moved, since + // bb2's LE will become forwards once bb1 is moved after bb3. + // bb1: - LoopExit + // bb2: + // LE bb1 - Terminator + // bb3: - Preheader + // WLS bb1 + // bb4: - Header + for (auto It = LoopExit->getIterator(); It != Preheader->getIterator(); + It++) { + MachineBasicBlock *MBB = &*It; + for (auto &Terminator : MBB->terminators()) { + if (Terminator.getOpcode() != ARM::t2LoopEnd && + Terminator.getOpcode() != ARM::t2LoopEndDec) + continue; + MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB(); + // The LE will become forwards branching if it branches to LoopExit + // which isn't allowed by the architecture, so we should avoid + // introducing these. 
+ // TODO: Analyse the blocks to make a decision if it would be worth + // moving LoopExit even if we'd introduce a forwards LE + if (LETarget == LoopExit) { + LLVM_DEBUG(dbgs() << DEBUG_PREFIX + << "Can't move the target block as it would " + "introduce a new forwards LE branch\n"); + CanMove = false; + break; + } + } + } + + if (!CanMove) + break; + } + + if (CanMove) { + // Preheader need not be the loop's real preheader; it is whichever block + // findWLS returned, i.e. the one that contains the t2WhileLoopStartLR. + moveBasicBlock(LoopExit, Preheader); + Changed = true; + break; + } + } +} + +void ARMBlockPlacement::processPostOrderLoops(MachineLoop *ML, bool &Changed) { + for (auto *InnerML : *ML) { + processPostOrderLoops(InnerML, Changed); + } + fixWLS(ML, Changed); +} + bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) - return false; + return false; const ARMSubtarget &ST = static_cast(MF.getSubtarget()); if (!ST.hasLOB()) return false; @@ -72,108 +208,9 @@ BBUtils->adjustBBOffsetsAfter(&MF.front()); bool Changed = false; - // Find loops with a backwards branching WLS. - // This requires looping over the loops in the function, checking each - // preheader for a WLS and if its target is before the preheader. If moving - // the target block wouldn't produce another backwards WLS or a new forwards - // LE branch then move the target block after the preheader. + // Find loops with a backwards branching WLS and fix if possible. for (auto *ML : *MLI) { - MachineBasicBlock *Preheader = ML->getLoopPredecessor(); - if (!Preheader) - continue; - - for (auto &Terminator : Preheader->terminators()) { - if (Terminator.getOpcode() != ARM::t2WhileLoopStartLR) - continue; - MachineBasicBlock *LoopExit = Terminator.getOperand(2).getMBB(); - // We don't want to move the function's entry block. 
- if (!LoopExit->getPrevNode()) - continue; - if (blockIsBefore(Preheader, LoopExit)) - continue; - LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from " - << Preheader->getFullName() << " to " - << LoopExit->getFullName() << "\n"); - - // Make sure that moving the target block doesn't cause any of its WLSs - // that were previously not backwards to become backwards - bool CanMove = true; - for (auto &LoopExitTerminator : LoopExit->terminators()) { - if (LoopExitTerminator.getOpcode() != ARM::t2WhileLoopStartLR) - continue; - // An example loop structure where the LoopExit can't be moved, since - // bb1's WLS will become backwards once it's moved after bb3 bb1: - - // LoopExit - // WLS bb2 - LoopExit2 - // bb2: - // ... - // bb3: - Preheader - // WLS bb1 - // bb4: - Header - MachineBasicBlock *LoopExit2 = - LoopExitTerminator.getOperand(2).getMBB(); - // If the WLS from LoopExit to LoopExit2 is already backwards then - // moving LoopExit won't affect it, so it can be moved. If LoopExit2 is - // after the Preheader then moving will keep it as a forward branch, so - // it can be moved. If LoopExit2 is between the Preheader and LoopExit - // then moving LoopExit will make it a backwards branch, so it can't be - // moved since we'd fix one and introduce one backwards branch. - // TODO: Analyse the blocks to make a decision if it would be worth - // moving LoopExit even if LoopExit2 is between the Preheader and - // LoopExit. - if (!blockIsBefore(LoopExit2, LoopExit) && - (LoopExit2 == Preheader || blockIsBefore(LoopExit2, Preheader))) { - LLVM_DEBUG(dbgs() << DEBUG_PREFIX - << "Can't move the target block as it would " - "introduce a new backwards WLS branch\n"); - CanMove = false; - break; - } - } - - if (CanMove) { - // Make sure no LEs become forwards. - // An example loop structure where the LoopExit can't be moved, since - // bb2's LE will become forwards once bb1 is moved after bb3. 
- // bb1: - LoopExit - // bb2: - // LE bb1 - Terminator - // bb3: - Preheader - // WLS bb1 - // bb4: - Header - for (auto It = LoopExit->getIterator(); It != Preheader->getIterator(); - It++) { - MachineBasicBlock *MBB = &*It; - for (auto &Terminator : MBB->terminators()) { - if (Terminator.getOpcode() != ARM::t2LoopEnd && - Terminator.getOpcode() != ARM::t2LoopEndDec) - continue; - MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB(); - // The LE will become forwards branching if it branches to LoopExit - // which isn't allowed by the architecture, so we should avoid - // introducing these. - // TODO: Analyse the blocks to make a decision if it would be worth - // moving LoopExit even if we'd introduce a forwards LE - if (LETarget == LoopExit) { - LLVM_DEBUG(dbgs() << DEBUG_PREFIX - << "Can't move the target block as it would " - "introduce a new forwards LE branch\n"); - CanMove = false; - break; - } - } - } - - if (!CanMove) - break; - } - - if (CanMove) { - moveBasicBlock(LoopExit, Preheader); - Changed = true; - break; - } - } + processPostOrderLoops(ML, Changed); } return Changed; diff --git a/llvm/test/CodeGen/Thumb2/block-placement.mir b/llvm/test/CodeGen/Thumb2/block-placement.mir --- a/llvm/test/CodeGen/Thumb2/block-placement.mir +++ b/llvm/test/CodeGen/Thumb2/block-placement.mir @@ -31,6 +31,11 @@ unreachable } + define void @nested_loops(i32 %n, i32 %m, i32 %l, i8* noalias %X, i8* noalias %Y) local_unnamed_addr #0 { + entry: + unreachable + } + declare dso_local i32 @g(...) local_unnamed_addr #1 declare dso_local i32 @h(...) local_unnamed_addr #1 @@ -441,3 +446,188 @@ bb.5: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r7, def $pc ... 
+--- +name: nested_loops +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$r0' } + - { reg: '$r1' } + - { reg: '$r2' } + - { reg: '$r3' } +frameInfo: + stackSize: 32 + maxAlignment: 4 + maxCallFrameSize: 0 +fixedStack: + - { id: 0, size: 4, alignment: 8, isImmutable: true } +stack: + - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, callee-saved-register: '$lr', + callee-saved-restored: false } + - { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, callee-saved-register: '$r10' } + - { id: 2, type: spill-slot, offset: -12, size: 4, alignment: 4, callee-saved-register: '$r9' } + - { id: 3, type: spill-slot, offset: -16, size: 4, alignment: 4, callee-saved-register: '$r8' } + - { id: 4, type: spill-slot, offset: -20, size: 4, alignment: 4, callee-saved-register: '$r7' } + - { id: 5, type: spill-slot, offset: -24, size: 4, alignment: 4, callee-saved-register: '$r6' } + - { id: 6, type: spill-slot, offset: -28, size: 4, alignment: 4, callee-saved-register: '$r5' } + - { id: 7, type: spill-slot, offset: -32, size: 4, alignment: 4, callee-saved-register: '$r4' } +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: nested_loops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $lr + ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 32 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r10, -8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r9, -12 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r8, -16 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -20 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -24 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -28 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -32 + ; CHECK: tCMPi8 
renamable $r0, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: $sp = frame-destroy t2LDMIA_RET $sp, 11 /* CC::lt */, killed $cpsr, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc, implicit killed $itstate + ; CHECK: bb.1: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: renamable $r12 = t2LDRi12 $sp, 32, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + ; CHECK: $r9 = tMOVr killed $r2, 14 /* CC::al */, $noreg + ; CHECK: renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg + ; CHECK: t2B %bb.3, 14 /* CC::al */, $noreg + ; CHECK: bb.2: + ; CHECK: successors: %bb.9(0x04000000), %bb.3(0x7c000000) + ; CHECK: liveins: $r0, $r1, $r3, $r8, $r9, $r12 + ; CHECK: renamable $r8 = nuw nsw t2ADDri killed renamable $r8, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 1, 14 /* CC::al */, $noreg + ; CHECK: tCMPhir renamable $r8, renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: renamable $r12 = t2ADDri killed renamable $r12, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: t2Bcc %bb.9, 0 /* CC::eq */, killed $cpsr + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $r0, $r1, $r3, $r8, $r9, $r12 + ; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, killed $cpsr + ; CHECK: bb.4: + ; CHECK: successors: %bb.6(0x80000000) + ; CHECK: liveins: $r0, $r1, $r3, $r8, $r9, $r12 + ; CHECK: renamable $r4, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg + ; CHECK: $r10 = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: $r2 = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: t2B %bb.6, 14 /* CC::al */, $noreg + ; CHECK: bb.6: + ; CHECK: successors: %bb.7(0x50000000), %bb.5(0x30000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12 + ; CHECK: renamable $lr = 
t2WhileLoopStartLR killed renamable $r9, %bb.5, implicit-def dead $cpsr + ; CHECK: tB %bb.7, 14 /* CC::al */, $noreg + ; CHECK: bb.5: + ; CHECK: successors: %bb.2(0x04000000), %bb.6(0x7c000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12 + ; CHECK: renamable $r4, dead $cpsr = nuw nsw tADDi8 killed renamable $r4, 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 1, 14 /* CC::al */, $noreg + ; CHECK: tCMPr renamable $r4, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: renamable $r10 = t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: t2Bcc %bb.2, 0 /* CC::eq */, killed $cpsr + ; CHECK: tB %bb.6, 14 /* CC::al */, $noreg + ; CHECK: bb.7: + ; CHECK: successors: %bb.8(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12 + ; CHECK: $r5 = tMOVr $r10, 14 /* CC::al */, $noreg + ; CHECK: $r6 = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK: t2B %bb.8, 14 /* CC::al */, $noreg + ; CHECK: bb.8: + ; CHECK: successors: %bb.8(0x7c000000), %bb.5(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r8, $r9, $r10, $r12 + ; CHECK: tSTRi killed $r0, $r1, 0, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.8, implicit-def dead $cpsr + ; CHECK: t2B %bb.5, 14 /* CC::al */, $noreg + ; CHECK: bb.9: + ; CHECK: $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc + bb.0: + successors: %bb.1 + liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $lr + + $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $lr + frame-setup CFI_INSTRUCTION def_cfa_offset 32 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r10, -8 + frame-setup CFI_INSTRUCTION offset $r9, -12 + frame-setup 
CFI_INSTRUCTION offset $r8, -16 + frame-setup CFI_INSTRUCTION offset $r7, -20 + frame-setup CFI_INSTRUCTION offset $r6, -24 + frame-setup CFI_INSTRUCTION offset $r5, -28 + frame-setup CFI_INSTRUCTION offset $r4, -32 + tCMPi8 renamable $r0, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + $sp = frame-destroy t2LDMIA_RET $sp, 11 /* CC::lt */, killed $cpsr, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc, implicit killed $itstate + + bb.1: + liveins: $r0, $r1, $r2, $r3 + + renamable $r12 = t2LDRi12 $sp, 32, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + $r9 = tMOVr killed $r2, 14 /* CC::al */, $noreg + renamable $r8 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg + t2B %bb.2, 14 /* CC::al */, $noreg + + bb.8: + successors: %bb.9(0x04000000), %bb.2(0x7c000000) + liveins: $r0, $r1, $r3, $r8, $r9, $r12 + + renamable $r8 = nuw nsw t2ADDri killed renamable $r8, 1, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 1, 14 /* CC::al */, $noreg + tCMPhir renamable $r8, renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr + renamable $r12 = t2ADDri killed renamable $r12, 1, 14 /* CC::al */, $noreg, $noreg + t2Bcc %bb.9, 0 /* CC::eq */, killed $cpsr + + bb.2: + successors: %bb.3(0x50000000), %bb.8(0x30000000) + liveins: $r0, $r1, $r3, $r8, $r9, $r12 + + tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.8, 11 /* CC::lt */, killed $cpsr + + bb.3: + liveins: $r0, $r1, $r3, $r8, $r9, $r12 + + renamable $r4, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg + $r10 = tMOVr $r12, 14 /* CC::al */, $noreg + $r2 = tMOVr $r3, 14 /* CC::al */, $noreg + t2B %bb.4, 14 /* CC::al */, $noreg + + bb.7: + successors: %bb.8(0x04000000), %bb.4(0x7c000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12 + + renamable $r4, dead $cpsr = nuw nsw tADDi8 killed renamable $r4, 1, 14 /* CC::al */, $noreg + renamable $r2, dead $cpsr = tADDi8 killed 
renamable $r2, 1, 14 /* CC::al */, $noreg + tCMPr renamable $r4, renamable $r1, 14 /* CC::al */, $noreg, implicit-def $cpsr + renamable $r10 = t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg + t2Bcc %bb.8, 0 /* CC::eq */, killed $cpsr + + bb.4: + successors: %bb.5(0x50000000), %bb.7(0x30000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12 + + renamable $lr = t2WhileLoopStartLR killed renamable $r9, %bb.7, implicit-def dead $cpsr + + bb.5: + liveins: $r0, $r1, $r2, $r3, $r4, $r8, $r9, $r10, $r12 + + $r5 = tMOVr $r10, 14 /* CC::al */, $noreg + $r6 = tMOVr $r2, 14 /* CC::al */, $noreg + t2B %bb.6, 14 /* CC::al */, $noreg + + bb.6: + successors: %bb.6(0x7c000000), %bb.7(0x04000000) + liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r8, $r9, $r10, $r12 + + tSTRi killed $r0, $r1, 0, 14 /* CC::al */, $noreg + renamable $lr = t2LoopEndDec killed renamable $lr, %bb.6, implicit-def dead $cpsr + t2B %bb.7, 14 /* CC::al */, $noreg + + bb.9: + $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $pc + +... 
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1077,18 +1077,10 @@ ; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: b .LBB16_4 -; CHECK-NEXT: .LBB16_3: @ %while.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: subs.w r12, r12, #1 -; CHECK-NEXT: vstrb.8 q0, [r2], #8 -; CHECK-NEXT: add.w r0, r5, r0, lsl #1 -; CHECK-NEXT: add.w r5, r0, #8 -; CHECK-NEXT: beq.w .LBB16_12 -; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_3: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_5 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: ldrh.w lr, [r3, #14] @@ -1125,14 +1117,14 @@ ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vfma.f16 q0, q1, lr ; CHECK-NEXT: cmp r0, #16 -; CHECK-NEXT: blo .LBB16_7 -; CHECK-NEXT: @ %bb.5: @ %for.body.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: blo .LBB16_6 +; CHECK-NEXT: @ %bb.4: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: .LBB16_6: @ %for.body -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: .LBB16_5: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh r0, [r6], #16 ; CHECK-NEXT: vldrw.u32 q1, [r5] @@ -1163,33 +1155,39 @@ ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: vfma.f16 q0, q1, r4 -; CHECK-NEXT: le lr, .LBB16_6 -; 
CHECK-NEXT: b .LBB16_8 -; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: le lr, .LBB16_5 +; CHECK-NEXT: b .LBB16_7 +; CHECK-NEXT: .LBB16_6: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: .LBB16_8: @ %for.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: .LBB16_7: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: subs.w lr, r0, #0 -; CHECK-NEXT: beq.w .LBB16_3 +; CHECK-NEXT: wls lr, r0, .LBB16_8 ; CHECK-NEXT: b .LBB16_9 +; CHECK-NEXT: .LBB16_8: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: subs.w r12, r12, #1 +; CHECK-NEXT: vstrb.8 q0, [r2], #8 +; CHECK-NEXT: add.w r0, r5, r0, lsl #1 +; CHECK-NEXT: add.w r5, r0, #8 +; CHECK-NEXT: beq .LBB16_12 +; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: .LBB16_10: @ %while.body76 -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh r4, [r6], #2 ; CHECK-NEXT: vldrh.u16 q1, [r0], #2 ; CHECK-NEXT: vfma.f16 q0, q1, r4 -; CHECK-NEXT: subs.w lr, lr, #1 -; CHECK-NEXT: bne .LBB16_10 -; CHECK-NEXT: b .LBB16_11 -; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: le lr, .LBB16_10 +; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r5, r5, r0, lsl #1 -; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: b .LBB16_8 ; CHECK-NEXT: .LBB16_12: @ %if.end ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git 
a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1071,18 +1071,10 @@ ; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: b .LBB16_4 -; CHECK-NEXT: .LBB16_3: @ %while.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: subs.w r12, r12, #1 -; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: add.w r0, r4, r0, lsl #2 -; CHECK-NEXT: add.w r4, r0, #16 -; CHECK-NEXT: beq .LBB16_12 -; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_3: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_5 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 ; CHECK-NEXT: add.w lr, r10, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -1109,14 +1101,14 @@ ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: vfma.f32 q0, q1, r8 -; CHECK-NEXT: blo .LBB16_7 -; CHECK-NEXT: @ %bb.5: @ %for.body.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: blo .LBB16_6 +; CHECK-NEXT: @ %bb.4: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: .LBB16_6: @ %for.body -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: .LBB16_5: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11} ; CHECK-NEXT: vldrw.u32 q1, [r4], #32 @@ -1137,34 +1129,40 @@ ; CHECK-NEXT: vfma.f32 q0, q2, r11 ; CHECK-NEXT: vfma.f32 q0, q3, r9 ; CHECK-NEXT: vfma.f32 q0, q1, r1 -; CHECK-NEXT: le lr, .LBB16_6 -; 
CHECK-NEXT: b .LBB16_8 -; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: le lr, .LBB16_5 +; CHECK-NEXT: b .LBB16_7 +; CHECK-NEXT: .LBB16_6: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: .LBB16_8: @ %for.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: .LBB16_7: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload -; CHECK-NEXT: subs.w lr, r0, #0 -; CHECK-NEXT: beq .LBB16_3 +; CHECK-NEXT: wls lr, r0, .LBB16_8 ; CHECK-NEXT: b .LBB16_9 +; CHECK-NEXT: .LBB16_8: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: subs.w r12, r12, #1 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: add.w r0, r4, r0, lsl #2 +; CHECK-NEXT: add.w r4, r0, #16 +; CHECK-NEXT: beq .LBB16_12 +; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: .LBB16_10: @ %while.body76 -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldr r0, [r7], #4 ; CHECK-NEXT: vldrw.u32 q1, [r3], #4 ; CHECK-NEXT: vfma.f32 q0, q1, r0 -; CHECK-NEXT: subs.w lr, lr, #1 -; CHECK-NEXT: bne .LBB16_10 -; CHECK-NEXT: b .LBB16_11 -; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: le lr, .LBB16_10 +; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: add.w r4, r4, r0, lsl #2 -; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: b .LBB16_8 ; CHECK-NEXT: .LBB16_12: @ %if.end ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}