diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h --- a/llvm/lib/Target/ARM/ARM.h +++ b/llvm/lib/Target/ARM/ARM.h @@ -37,6 +37,7 @@ Pass *createMVETailPredicationPass(); FunctionPass *createARMLowOverheadLoopsPass(); +FunctionPass *createARMBlockPlacementPass(); Pass *createARMParallelDSPPass(); FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); @@ -69,6 +70,7 @@ void initializeMVEVPTBlockPass(PassRegistry &); void initializeMVEVPTOptimisationsPass(PassRegistry &); void initializeARMLowOverheadLoopsPass(PassRegistry &); +void initializeARMBlockPlacementPass(PassRegistry &); void initializeMVETailPredicationPass(PassRegistry &); void initializeMVEGatherScatterLoweringPass(PassRegistry &); diff --git a/llvm/lib/Target/ARM/ARMBlockPlacement.cpp b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/ARM/ARMBlockPlacement.cpp @@ -0,0 +1,166 @@ +//===-- ARMBlockPlacement.cpp - ARM block placement pass ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass re-arranges machine basic blocks to suit target requirements. +// Currently it only moves blocks to fix backwards WLS branches. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBasicBlockInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-block-placement" +#define DEBUG_PREFIX "ARM Block Placement: " + +namespace llvm { +class ARMBlockPlacement : public MachineFunctionPass { +private: + const ARMBaseInstrInfo *TII; + std::unique_ptr BBUtils = nullptr; + MachineLoopInfo *MLI = nullptr; + +public: + static char ID; + ARMBlockPlacement() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + void moveBasicBlock(MachineBasicBlock *BB, MachineBasicBlock *After); + bool blockIsBefore(MachineBasicBlock *BB, MachineBasicBlock *Other); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // namespace llvm + +FunctionPass *llvm::createARMBlockPlacementPass() { + return new ARMBlockPlacement(); +} + +char ARMBlockPlacement::ID = 0; + +INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false, + false) + +bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) { + const ARMSubtarget &ST = static_cast(MF.getSubtarget()); + if (!ST.hasLOB()) + return false; + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Running on " << MF.getName() << "\n"); + MLI = &getAnalysis(); + TII = static_cast(ST.getInstrInfo()); + BBUtils = std::unique_ptr(new ARMBasicBlockUtils(MF)); + MF.RenumberBlocks(); + BBUtils->computeAllBlockSizes(); + BBUtils->adjustBBOffsetsAfter(&MF.front()); + bool Changed = false; + + // Find loops with a backwards branching WLS + for (auto ML : *MLI) { + MachineBasicBlock *BB = ML->getLoopPredecessor(); + + for (auto &Terminator : BB->terminators()) { + if (Terminator.getOpcode() != ARM::t2WhileLoopStart) + continue; + MachineBasicBlock *Target = Terminator.getOperand(1).getMBB(); + if (blockIsBefore(BB, Target)) + continue; + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from " + << BB->getFullName() << " to " << Target->getFullName() + << "\n"); + + // Make sure none of the nested loops also have a WLS, as in that case we + // can't move + bool CanMove = true; + SmallVector InnerLoops; + ML->getInnerLoopsInPreorder(*ML, InnerLoops); + for (auto InnerLoop : InnerLoops) { + MachineBasicBlock *InnerPreheader = InnerLoop->getLoopPreheader(); + for (auto &InnerTerminator : InnerPreheader->terminators()) { + if (InnerTerminator.getOpcode() == ARM::t2WhileLoopStart) { + LLVM_DEBUG(dbgs() + << DEBUG_PREFIX + << "Found an inner loop with a WLS. Not moving\n"); + CanMove = false; + break; + } + } + if (!CanMove) + break; + } + + if (CanMove) { + moveBasicBlock(Target, BB); + Changed = true; + break; + } + } + } + + return Changed; +} + +bool ARMBlockPlacement::blockIsBefore(MachineBasicBlock *BB, + MachineBasicBlock *Other) { + return BBUtils->getOffsetOf(Other) > BBUtils->getOffsetOf(BB); +} + +void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB, + MachineBasicBlock *After) { + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Moving " << BB->getName() << " after " + << After->getName() << "\n"); + MachineBasicBlock *BBPrevious = BB->getPrevNode(); + MachineBasicBlock *AfterNext = After->getNextNode(); + MachineBasicBlock *BBNext = BB->getNextNode(); + + BB->moveAfter(After); + + auto fixFallthrough = [&](MachineBasicBlock *From, MachineBasicBlock *To) { + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Checking for fallthrough from " + << From->getName() << " to " << To->getName() << "\n"); + assert(From->isSuccessor(To) && + "'To' is expected to be a successor of 'From'"); + MachineInstr &Terminator = *(--From->terminators().end()); + if (!Terminator.isUnconditionalBranch()) { + // The BB doesn't have an unconditional branch so it relied on + // fall-through. Fix by adding an unconditional branch to the moved BB. + unsigned BrOpc = + BBUtils->isBBInRange(&Terminator, To, 254) ? ARM::tB : ARM::t2B; + MachineInstrBuilder MIB = + BuildMI(From, Terminator.getDebugLoc(), TII->get(BrOpc)); + MIB.addMBB(To); + MIB.addImm(ARMCC::CondCodes::AL); + MIB.addReg(ARM::NoRegister); + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Adding unconditional branch from " + << From->getName() << " to " << To->getName() << ": " + << *MIB.getInstr()); + } + }; + + // Fix fall-through to the moved BB from the one that used to be before it + if (BBPrevious && BBPrevious->isSuccessor(BB)) + fixFallthrough(BBPrevious, BB); + // Fix fall through from the destination BB to the one that used to follow + if (AfterNext && After->isSuccessor(AfterNext)) + fixFallthrough(After, AfterNext); + // Fix fall through from the moved BB to the one that used to follow + if (BBNext && BB->isSuccessor(BBNext)) + fixFallthrough(BB, BBNext); + + BBUtils->adjustBBOffsetsAfter(After); +} diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -99,6 +99,7 @@ initializeMVEVPTOptimisationsPass(Registry); initializeMVETailPredicationPass(Registry); initializeARMLowOverheadLoopsPass(Registry); + initializeARMBlockPlacementPass(Registry); initializeMVEGatherScatterLoweringPass(Registry); } @@ -548,6 +549,8 @@ return MF.getSubtarget().isThumb2(); })); + addPass(createARMBlockPlacementPass()); + // Don't optimize barriers at -O0. if (getOptLevel() != CodeGenOpt::None) addPass(createARMOptimizeBarriersPass()); diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt --- a/llvm/lib/Target/ARM/CMakeLists.txt +++ b/llvm/lib/Target/ARM/CMakeLists.txt @@ -41,6 +41,7 @@ ARMParallelDSP.cpp ARMLoadStoreOptimizer.cpp ARMLowOverheadLoops.cpp + ARMBlockPlacement.cpp ARMMCInstLower.cpp ARMMachineFunctionInfo.cpp ARMMacroFusion.cpp diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -168,6 +168,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: Thumb2 instruction size reduce pass ; CHECK-NEXT: Unpack machine instruction bundles +; CHECK-NEXT: ARM block placement ; CHECK-NEXT: optimise barriers pass ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis diff --git a/llvm/test/CodeGen/Thumb2/block-placement.mir b/llvm/test/CodeGen/Thumb2/block-placement.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/block-placement.mir @@ -0,0 +1,205 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -run-pass=arm-block-placement %s -o - | FileCheck %s +--- | + define void @backwards_branch(i32 %N, i32* nocapture %a, i32* nocapture readonly %b) local_unnamed_addr #0 { + entry: + unreachable + } + + define void @backwards_branch_nested(i32 %N, i32 %M, i32* nocapture %a, i32* nocapture %b, i32* nocapture %c) local_unnamed_addr #0 { + entry: + unreachable + } +... +--- +name: backwards_branch +body: | + ; CHECK-LABEL: name: backwards_branch + ; CHECK: bb.0: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: tCMPi8 renamable $r0, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: t2WhileLoopStart killed renamable $r0, %bb.1, implicit-def dead $cpsr + ; CHECK: tB %bb.3, 14 /* CC::al */, $noreg + ; CHECK: bb.1: + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc + ; CHECK: bb.3: + ; CHECK: successors: %bb.3(0x7c000000), %bb.1(0x04000000) + ; CHECK: renamable $r0 = tLDRi renamable $r2, 0, 14 /* CC::al */, $noreg + ; CHECK: tSTRi killed renamable $r0, renamable $r1, 0, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.3, implicit-def dead $cpsr + ; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg + bb.0: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r0, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + + bb.1: + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc + + bb.2: + successors: %bb.3(0x80000000) + liveins: $r0, $r1, $r2 + + t2WhileLoopStart killed renamable $r0, %bb.1, implicit-def dead $cpsr + + bb.3: + successors: %bb.3(0x7c000000), %bb.1(0x04000000) + liveins: $lr, $r1, $r2 + + renamable $r0 = tLDRi renamable $r2, 0, 14 /* CC::al */, $noreg + tSTRi killed renamable $r0, renamable $r1, 0, 14 /* CC::al */, $noreg + renamable $lr = t2LoopEndDec killed renamable $lr, %bb.3, implicit-def dead $cpsr + t2B %bb.1, 14 /* CC::al */, $noreg + +... +--- +name: backwards_branch_nested +body: | + ; CHECK-LABEL: name: backwards_branch_nested + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x50000000), %bb.5(0x30000000) + ; CHECK: tCMPi8 renamable $r0, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2Bcc %bb.5, 11 /* CC::lt */, killed $cpsr + ; CHECK: bb.1: + ; CHECK: successors: %bb.3(0x50000000), %bb.7(0x30000000) + ; CHECK: renamable $r12 = t2LDRi12 $r0, 8, 14 /* CC::al */, $noreg + ; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2Bcc %bb.7, 11 /* CC::lt */, killed $cpsr + ; CHECK: t2B %bb.3, 14 /* CC::al */, $noreg + ; CHECK: bb.2: + ; CHECK: successors: %bb.3(0x7c000000), %bb.5(0x04000000) + ; CHECK: renamable $r4 = t2LDRi12 renamable $r12, 0, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0, $cpsr = nsw tSUBi8 killed renamable $r0, 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4 = t2ADDri killed renamable $r4, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: t2STRi12 killed renamable $r4, renamable $r12, 0, 14 /* CC::al */, $noreg + ; CHECK: t2Bcc %bb.3, 12 /* CC::gt */, killed $cpsr + ; CHECK: t2B %bb.5, 14 /* CC::al */, $noreg + ; CHECK: bb.3: + ; CHECK: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK: renamable $r4 = tLDRi renamable $r3, 0, 14 /* CC::al */, $noreg + ; CHECK: t2WhileLoopStart renamable $r1, %bb.2, implicit-def dead $cpsr + ; CHECK: bb.4: + ; CHECK: successors: %bb.4(0x7c000000), %bb.2(0x04000000) + ; CHECK: tSTRi killed renamable $r4, renamable $r2, 0, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4 = tLDRi renamable $r3, 0, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4, dead $cpsr = tADDi8 killed renamable $r4, 1, 14 /* CC::al */, $noreg + ; CHECK: tSTRi renamable $r4, renamable $r3, 0, 14 /* CC::al */, $noreg + ; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.4, implicit-def dead $cpsr + ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg + ; CHECK: bb.5: + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r6, def $r7, def $pc + ; CHECK: bb.7: + ; CHECK: successors: %bb.8(0x40000000), %bb.6(0x40000000) + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r0, 3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r2 = t2LDRi12 renamable $r12, 0, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r1, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $q1 = MVE_VMOVimmi32 1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r1, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: t2WhileLoopStart renamable $r0, %bb.6, implicit-def dead $cpsr + ; CHECK: tB %bb.8, 14 /* CC::al */, $noreg + ; CHECK: bb.6: + ; CHECK: t2STRi12 killed renamable $r2, killed renamable $r12, 0, 14 /* CC::al */, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r6, def $r7, def $pc + ; CHECK: bb.8: + ; CHECK: successors: %bb.8(0x7c000000), %bb.6(0x04000000) + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r0, 0, $noreg + ; CHECK: $q2 = MVE_VORR $q0, $q0, 0, $noreg, undef $q2 + ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14 /* CC::al */, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $q2 = MVE_VORR renamable $q1, renamable $q1, 1, killed renamable $vpr, killed renamable $q2 + ; CHECK: renamable $r2 = MVE_VADDVu32acc killed renamable $r2, killed renamable $q2, 0, $noreg + ; CHECK: renamable $lr = t2LoopEndDec killed renamable $lr, %bb.8, implicit-def dead $cpsr + ; CHECK: t2B %bb.6, 14 /* CC::al */, $noreg + bb.0: + successors: %bb.1(0x50000000), %bb.5(0x30000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r6, $lr + + tCMPi8 renamable $r0, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.5, 11 /* CC::lt */, killed $cpsr + + bb.1: + successors: %bb.3(0x50000000), %bb.7(0x30000000) + liveins: $r0, $r1, $r2, $r3 + + renamable $r12 = t2LDRi12 $r0, 8, 14 /* CC::al */, $noreg + tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.7, 11 /* CC::lt */, killed $cpsr + t2B %bb.3, 14 /* CC::al */, $noreg + + bb.2: + successors: %bb.3(0x7c000000), %bb.5(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r12 + + renamable $r4 = t2LDRi12 renamable $r12, 0, 14 /* CC::al */, $noreg + renamable $r0, $cpsr = nsw tSUBi8 killed renamable $r0, 1, 14 /* CC::al */, $noreg + renamable $r4 = t2ADDri killed renamable $r4, 1, 14 /* CC::al */, $noreg, $noreg + t2STRi12 killed renamable $r4, renamable $r12, 0, 14 /* CC::al */, $noreg + t2Bcc %bb.3, 12 /* CC::gt */, killed $cpsr + t2B %bb.5, 14 /* CC::al */, $noreg + + bb.3: + successors: %bb.4(0x80000000), %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r12 + + renamable $r4 = tLDRi renamable $r3, 0, 14 /* CC::al */, $noreg + t2WhileLoopStart renamable $r1, %bb.2, implicit-def dead $cpsr + + + bb.4: + successors: %bb.4(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2, $r3, $r4, $r12 + + tSTRi killed renamable $r4, renamable $r2, 0, 14 /* CC::al */, $noreg + renamable $r4 = tLDRi renamable $r3, 0, 14 /* CC::al */, $noreg + renamable $r4, dead $cpsr = tADDi8 killed renamable $r4, 1, 14 /* CC::al */, $noreg + tSTRi renamable $r4, renamable $r3, 0, 14 /* CC::al */, $noreg + renamable $lr = t2LoopEndDec killed renamable $lr, %bb.4, implicit-def dead $cpsr + t2B %bb.2, 14 /* CC::al */, $noreg + + bb.5: + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r6, def $r7, def $pc + + bb.6: + liveins: $r2, $r12 + + t2STRi12 killed renamable $r2, killed renamable $r12, 0, 14 /* CC::al */, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r6, def $r7, def $pc + + bb.7: + successors: %bb.8(0x80000000), %bb.6(0x80000000) + liveins: $r0, $r12 + + renamable $r3, dead $cpsr = tADDi3 renamable $r0, 3, 14 /* CC::al */, $noreg + renamable $r2 = t2LDRi12 renamable $r12, 0, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r1, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $q1 = MVE_VMOVimmi32 1, 0, $noreg, undef renamable $q1 + renamable $lr = nuw nsw t2ADDrs killed renamable $r1, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg + t2WhileLoopStart renamable $r0, %bb.6, implicit-def dead $cpsr + + bb.8: + successors: %bb.8(0x7c000000), %bb.6(0x04000000) + liveins: $lr, $q0, $q1, $r0, $r2, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r0, 0, $noreg + $q2 = MVE_VORR $q0, $q0, 0, $noreg, undef $q2 + renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 4, 14 /* CC::al */, $noreg + MVE_VPST 8, implicit $vpr + renamable $q2 = MVE_VORR renamable $q1, renamable $q1, 1, killed renamable $vpr, killed renamable $q2 + renamable $r2 = MVE_VADDVu32acc killed renamable $r2, killed renamable $q2, 0, $noreg + renamable $lr = t2LoopEndDec killed renamable $lr, %bb.8, implicit-def dead $cpsr + t2B %bb.6, 14 /* CC::al */, $noreg + +... diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1100,19 +1100,10 @@ ; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: b .LBB16_4 -; CHECK-NEXT: .LBB16_3: @ %while.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: subs.w r12, r12, #1 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: vstrb.8 q0, [r2], #8 -; CHECK-NEXT: add.w r0, r5, r0, lsl #1 -; CHECK-NEXT: add.w r5, r0, #8 -; CHECK-NEXT: beq.w .LBB16_12 -; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_3: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_5 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 ; CHECK-NEXT: vldrw.u32 q0, [r1], #8 ; CHECK-NEXT: ldrh.w lr, [r3, #14] @@ -1150,14 +1141,14 @@ ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: vfma.f16 q0, q1, lr ; CHECK-NEXT: cmp r0, #16 -; CHECK-NEXT: blo .LBB16_7 -; CHECK-NEXT: @ %bb.5: @ %for.body.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: blo .LBB16_6 +; CHECK-NEXT: @ %bb.4: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: .LBB16_6: @ %for.body -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: .LBB16_5: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh r0, [r6], #16 ; CHECK-NEXT: vldrw.u32 q1, [r5] @@ -1188,32 +1179,39 @@ ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: vfma.f16 q0, q1, r1 -; CHECK-NEXT: le lr, .LBB16_6 -; CHECK-NEXT: b .LBB16_8 -; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: le lr, .LBB16_5 +; CHECK-NEXT: b .LBB16_7 +; CHECK-NEXT: .LBB16_6: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: .LBB16_8: @ %for.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: beq.w .LBB16_3 +; CHECK-NEXT: .LBB16_7: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 +; CHECK-NEXT: wls lr, r8, .LBB16_8 ; CHECK-NEXT: b .LBB16_9 +; CHECK-NEXT: .LBB16_8: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: subs.w r12, r12, #1 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vstrb.8 q0, [r2], #8 +; CHECK-NEXT: add.w r0, r5, r0, lsl #1 +; CHECK-NEXT: add.w r5, r0, #8 +; CHECK-NEXT: beq .LBB16_12 +; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov lr, r8 ; CHECK-NEXT: .LBB16_10: @ %while.body76 -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh r1, [r6], #2 ; CHECK-NEXT: vldrh.u16 q1, [r0], #2 -; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: vfma.f16 q0, q1, r1 -; CHECK-NEXT: bne .LBB16_10 -; CHECK-NEXT: b .LBB16_11 -; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: le lr, .LBB16_10 +; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: add.w r5, r5, r8, lsl #1 -; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: b .LBB16_8 ; CHECK-NEXT: .LBB16_12: @ %if.end ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1072,18 +1072,10 @@ ; CHECK-NEXT: str r6, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: b .LBB16_4 -; CHECK-NEXT: .LBB16_3: @ %while.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: subs.w r12, r12, #1 -; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: add.w r0, r5, r0, lsl #2 -; CHECK-NEXT: add.w r5, r0, #16 -; CHECK-NEXT: beq .LBB16_12 -; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_3: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_5 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 ; CHECK-NEXT: add.w lr, r10, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -1110,14 +1102,14 @@ ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: vfma.f32 q0, q1, r8 -; CHECK-NEXT: blo .LBB16_7 -; CHECK-NEXT: @ %bb.5: @ %for.body.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: blo .LBB16_6 +; CHECK-NEXT: @ %bb.4: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: .LBB16_6: @ %for.body -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: .LBB16_5: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldm.w r7, {r0, r3, r4, r6} ; CHECK-NEXT: vldrw.u32 q1, [r5], #32 @@ -1140,34 +1132,40 @@ ; CHECK-NEXT: adds r7, #32 ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: vfma.f32 q0, q1, r9 -; CHECK-NEXT: le lr, .LBB16_6 -; CHECK-NEXT: b .LBB16_8 -; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: le lr, .LBB16_5 +; CHECK-NEXT: b .LBB16_7 +; CHECK-NEXT: .LBB16_6: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: .LBB16_8: @ %for.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: .LBB16_7: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: ldrd r9, r1, [sp, #24] @ 8-byte Folded Reload ; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: cmp.w r4, #0 -; CHECK-NEXT: beq .LBB16_3 +; CHECK-NEXT: wls lr, r4, .LBB16_8 ; CHECK-NEXT: b .LBB16_9 +; CHECK-NEXT: .LBB16_8: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: subs.w r12, r12, #1 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: add.w r0, r5, r0, lsl #2 +; CHECK-NEXT: add.w r5, r0, #16 +; CHECK-NEXT: beq .LBB16_12 +; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: mov lr, r4 ; CHECK-NEXT: .LBB16_10: @ %while.body76 -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ Parent Loop BB16_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldr r0, [r7], #4 ; CHECK-NEXT: vldrw.u32 q1, [r3], #4 -; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: vfma.f32 q0, q1, r0 -; CHECK-NEXT: bne .LBB16_10 -; CHECK-NEXT: b .LBB16_11 -; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: le lr, .LBB16_10 +; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_3 Depth=1 ; CHECK-NEXT: add.w r5, r5, r4, lsl #2 -; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: b .LBB16_8 ; CHECK-NEXT: .LBB16_12: @ %if.end ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}