Index: llvm/trunk/lib/Target/ARM/ARMConstantIslandPass.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMConstantIslandPass.cpp +++ llvm/trunk/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -29,6 +29,7 @@ #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -70,6 +71,7 @@ STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed"); STATISTIC(NumJTMoved, "Number of jump table destination blocks moved"); STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted"); +STATISTIC(NumLEInserted, "Number of LE backwards branches inserted"); static cl::opt AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true), @@ -213,6 +215,7 @@ const ARMBaseInstrInfo *TII; const ARMSubtarget *STI; ARMFunctionInfo *AFI; + MachineDominatorTree *DT = nullptr; bool isThumb; bool isThumb1; bool isThumb2; @@ -225,6 +228,12 @@ bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); @@ -350,6 +359,7 @@ isPositionIndependentOrROPI = STI->getTargetLowering()->isPositionIndependent() || STI->isROPI(); AFI = MF->getInfo(); + DT = &getAnalysis(); isThumb = AFI->isThumbFunction(); isThumb1 = AFI->isThumb1OnlyFunction(); @@ -1809,16 +1819,10 @@ return MadeChange; } + bool ARMConstantIslands::optimizeThumb2Branches() { - bool MadeChange = false; - // The order in which branches appear in ImmBranches is approximately their - // order within the function body. By visiting later branches first, we reduce - // the distance between earlier forward branches and their targets, making it - // more likely that the cbn?z optimization, which can only apply to forward - // branches, will succeed. - for (unsigned i = ImmBranches.size(); i != 0; --i) { - ImmBranch &Br = ImmBranches[i-1]; + auto TryShrinkBranch = [this](ImmBranch &Br) { unsigned Opcode = Br.MI->getOpcode(); unsigned NewOpc = 0; unsigned Scale = 1; @@ -1846,47 +1850,115 @@ BBUtils->adjustBBSize(MBB, -2); BBUtils->adjustBBOffsetsAfter(MBB); ++NumT2BrShrunk; - MadeChange = true; + return true; } } + return false; + }; - Opcode = Br.MI->getOpcode(); - if (Opcode != ARM::tBcc) - continue; + struct ImmCompare { + MachineInstr* MI = nullptr; + unsigned NewOpc = 0; + }; + + auto FindCmpForCBZ = [this](ImmBranch &Br, ImmCompare &ImmCmp, + MachineBasicBlock *DestBB) { + ImmCmp.MI = nullptr; + ImmCmp.NewOpc = 0; // If the conditional branch doesn't kill CPSR, then CPSR can be liveout // so this transformation is not safe. if (!Br.MI->killsRegister(ARM::CPSR)) - continue; + return false; - NewOpc = 0; unsigned PredReg = 0; + unsigned NewOpc = 0; ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg); if (Pred == ARMCC::EQ) NewOpc = ARM::tCBZ; else if (Pred == ARMCC::NE) NewOpc = ARM::tCBNZ; - if (!NewOpc) - continue; - MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + else + return false; + // Check if the distance is within 126. Subtract starting offset by 2 // because the cmp will be eliminated. unsigned BrOffset = BBUtils->getOffsetOf(Br.MI) + 4 - 2; BBInfoVector &BBInfo = BBUtils->getBBInfo(); unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; if (BrOffset >= DestOffset || (DestOffset - BrOffset) > 126) - continue; + return false; // Search backwards to find a tCMPi8 auto *TRI = STI->getRegisterInfo(); MachineInstr *CmpMI = findCMPToFoldIntoCBZ(Br.MI, TRI); if (!CmpMI || CmpMI->getOpcode() != ARM::tCMPi8) + return false; + + ImmCmp.MI = CmpMI; + ImmCmp.NewOpc = NewOpc; + return true; + }; + + auto TryConvertToLE = [this](ImmBranch &Br, ImmCompare &Cmp) { + if (Br.MI->getOpcode() != ARM::t2Bcc || !STI->hasLOB() || + STI->hasMinSize()) + return false; + + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + if (BBUtils->getOffsetOf(MBB) < BBUtils->getOffsetOf(DestBB) || + !BBUtils->isBBInRange(Br.MI, DestBB, 4094)) + return false; + + if (!DT->dominates(DestBB, MBB)) + return false; + + // We queried for the CBN?Z opcode based upon the 'ExitBB', the opposite + // target of Br. So now we need to reverse the condition. + Cmp.NewOpc = Cmp.NewOpc == ARM::tCBZ ? ARM::tCBNZ : ARM::tCBZ; + + MachineInstrBuilder MIB = BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), + TII->get(ARM::t2LE)); + MIB.add(Br.MI->getOperand(0)); + Br.MI->eraseFromParent(); + Br.MI = MIB; + ++NumLEInserted; + return true; + }; + + bool MadeChange = false; + + // The order in which branches appear in ImmBranches is approximately their + // order within the function body. By visiting later branches first, we reduce + // the distance between earlier forward branches and their targets, making it + // more likely that the cbn?z optimization, which can only apply to forward + // branches, will succeed. + for (ImmBranch &Br : reverse(ImmBranches)) { + MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); + MachineBasicBlock *MBB = Br.MI->getParent(); + MachineBasicBlock *ExitBB = &MBB->back() == Br.MI ? + MBB->getFallThrough() : + MBB->back().getOperand(0).getMBB(); + + ImmCompare Cmp; + if (FindCmpForCBZ(Br, Cmp, ExitBB) && TryConvertToLE(Br, Cmp)) { + DestBB = ExitBB; + MadeChange = true; + } else { + FindCmpForCBZ(Br, Cmp, DestBB); + MadeChange |= TryShrinkBranch(Br); + } + + unsigned Opcode = Br.MI->getOpcode(); + if ((Opcode != ARM::tBcc && Opcode != ARM::t2LE) || !Cmp.NewOpc) continue; - Register Reg = CmpMI->getOperand(0).getReg(); + Register Reg = Cmp.MI->getOperand(0).getReg(); // Check for Kill flags on Reg. If they are present remove them and set kill // on the new CBZ. + auto *TRI = STI->getRegisterInfo(); MachineBasicBlock::iterator KillMI = Br.MI; bool RegKilled = false; do { @@ -1896,19 +1968,32 @@ RegKilled = true; break; } - } while (KillMI != CmpMI); + } while (KillMI != Cmp.MI); // Create the new CBZ/CBNZ - MachineBasicBlock *MBB = Br.MI->getParent(); - LLVM_DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI); + LLVM_DEBUG(dbgs() << "Fold: " << *Cmp.MI << " and: " << *Br.MI); MachineInstr *NewBR = - BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(NewOpc)) + BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(Cmp.NewOpc)) .addReg(Reg, getKillRegState(RegKilled)) .addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags()); - CmpMI->eraseFromParent(); - Br.MI->eraseFromParent(); - Br.MI = NewBR; + + Cmp.MI->eraseFromParent(); + BBInfoVector &BBInfo = BBUtils->getBBInfo(); BBInfo[MBB->getNumber()].Size -= 2; + + if (Br.MI->getOpcode() == ARM::tBcc) { + Br.MI->eraseFromParent(); + Br.MI = NewBR; + } else if (&MBB->back() != Br.MI) { + // We've generated an LE and already erased the original conditional + // branch. The CBN?Z is now used to branch to the other successor, so an + // unconditional branch terminator is now redundant. + MachineInstr *LastMI = &MBB->back(); + if (LastMI != Br.MI) { + BBInfo[MBB->getNumber()].Size -= LastMI->getDesc().getSize(); + LastMI->eraseFromParent(); + } + } BBUtils->adjustBBOffsetsAfter(MBB); ++NumCBZ; MadeChange = true; Index: llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll +++ llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll @@ -143,8 +143,8 @@ ; CHECK-NEXT: Thumb2 instruction size reduce pass ; CHECK-NEXT: Unpack machine instruction bundles ; CHECK-NEXT: optimise barriers pass -; CHECK-NEXT: ARM constant island placement and branch shortening pass ; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: ARM constant island placement and branch shortening pass ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: ARM Low Overhead Loops pass ; CHECK-NEXT: Contiguously Lay Out Funclets Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-cbnz.mir =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-cbnz.mir +++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-cbnz.mir @@ -0,0 +1,201 @@ +# RUN: llc -mtriple=thumbv8.1m.main %s -run-pass=arm-cp-islands --verify-machineinstrs -o - | FileCheck %s --check-prefix=CHECK-LOB +# RUN: llc -mtriple=thumbv8.1m.main -mattr=-lob %s -run-pass=arm-cp-islands --verify-machineinstrs -o - | FileCheck %s --check-prefix=CHECK-NOLOB + +# CHECK-NOLOB-NOT: t2LE + +# CHECK-LOB: bb.3.land.rhs: +# CHECK-LOB: renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg +# CHECK-LOB: tCBNZ $r0, %bb.8 +# CHECK-LOB: t2LE %bb.3 +# CHECK-LOB: bb.7.while.body19: +# CHECK-LOB: tCBZ $r0, %bb.8 +# CHECK-LOB: t2LE %bb.6 +# CHECK-LOB: bb.8: + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main" + + %struct.head_s = type { %struct.head_s*, %struct.data_s* } + %struct.data_s = type { i16, i16 } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcscc %struct.head_s* @search(%struct.head_s* readonly %list, %struct.data_s* nocapture readonly %info) local_unnamed_addr #0 { + entry: + %idx = getelementptr inbounds %struct.data_s, %struct.data_s* %info, i32 0, i32 1 + %0 = load i16, i16* %idx, align 2 + %cmp = icmp sgt i16 %0, -1 + br i1 %cmp, label %while.cond.preheader, label %while.cond9.preheader + + while.cond9.preheader: ; preds = %entry + %1 = icmp eq %struct.head_s* %list, null + br i1 %1, label %return, label %land.rhs11.lr.ph + + land.rhs11.lr.ph: ; preds = %while.cond9.preheader + %data16143 = bitcast %struct.data_s* %info to i16* + %2 = load i16, i16* %data16143, align 2 + %conv15 = sext i16 %2 to i32 + br label %land.rhs11 + + while.cond.preheader: ; preds = %entry + %3 = icmp eq %struct.head_s* %list, null + br i1 %3, label %return, label %land.rhs.preheader + + land.rhs.preheader: ; preds = %while.cond.preheader + br label %land.rhs + + land.rhs: ; preds = %land.rhs.preheader, %while.body + %list.addr.033 = phi %struct.head_s* [ %6, %while.body ], [ %list, %land.rhs.preheader ] + %info2 = getelementptr inbounds %struct.head_s, %struct.head_s* %list.addr.033, i32 0, i32 1 + %4 = load %struct.data_s*, %struct.data_s** %info2, align 4 + %idx3 = getelementptr inbounds %struct.data_s, %struct.data_s* %4, i32 0, i32 1 + %5 = load i16, i16* %idx3, align 2 + %cmp7 = icmp eq i16 %5, %0 + br i1 %cmp7, label %return, label %while.body + + while.body: ; preds = %land.rhs + %next4 = bitcast %struct.head_s* %list.addr.033 to %struct.head_s** + %6 = load %struct.head_s*, %struct.head_s** %next4, align 4 + %tobool = icmp ne %struct.head_s* %6, null + br i1 %tobool, label %return, label %land.rhs + + land.rhs11: ; preds = %while.body19, %land.rhs11.lr.ph + %list.addr.136 = phi %struct.head_s* [ %list, %land.rhs11.lr.ph ], [ %10, %while.body19 ] + %info12 = getelementptr inbounds %struct.head_s, %struct.head_s* %list.addr.136, i32 0, i32 1 + %7 = load %struct.data_s*, %struct.data_s** %info12, align 4 + %data165 = bitcast %struct.data_s* %7 to i16* + %8 = load i16, i16* %data165, align 2 + %9 = and i16 %8, 255 + %and = zext i16 %9 to i32 + %cmp16 = icmp eq i32 %and, %conv15 + br i1 %cmp16, label %return, label %while.body19 + + while.body19: ; preds = %land.rhs11 + %next206 = bitcast %struct.head_s* %list.addr.136 to %struct.head_s** + %10 = load %struct.head_s*, %struct.head_s** %next206, align 4 + %tobool10 = icmp eq %struct.head_s* %10, null + br i1 %tobool10, label %return, label %land.rhs11 + + return: ; preds = %while.body19, %land.rhs11, %while.body, %land.rhs, %while.cond.preheader, %while.cond9.preheader + %retval.0 = phi %struct.head_s* [ null, %while.cond.preheader ], [ null, %while.cond9.preheader ], [ %list.addr.033, %land.rhs ], [ null, %while.body ], [ %list.addr.136, %land.rhs11 ], [ null, %while.body19 ] + ret %struct.head_s* %retval.0 + } + + attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+lob,+ras,+soft-float,+strict-align,+thumb-mode,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8d16,-fp-armv8d16sp,-fp-armv8sp,-fp16,-fp16fml,-fp64,-fpregs,-fullfp16,-neon,-vfp2,-vfp2d16,-vfp2d16sp,-vfp2sp,-vfp3,-vfp3d16,-vfp3d16sp,-vfp3sp,-vfp4,-vfp4d16,-vfp4d16sp,-vfp4sp" "unsafe-fp-math"="false" "use-soft-float"="true" } + +... +--- +name: search +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.5(0x50000000), %bb.1(0x30000000) + liveins: $r0, $r1 + + renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14, $noreg :: (load 2 from %ir.idx) + t2CMPri renamable $r2, -1, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 13, killed $cpsr + + bb.5.while.cond.preheader: + successors: %bb.8(0x30000000), %bb.6(0x50000000) + liveins: $r0, $r2 + + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.8, 0, killed $cpsr + + bb.6.land.rhs.preheader: + successors: %bb.7(0x80000000) + liveins: $r0, $r2 + + renamable $r1 = tUXTH killed renamable $r2, 14, $noreg + + bb.7.land.rhs: + successors: %bb.8(0x04000000), %bb.7(0x7c000000) + liveins: $r0, $r1 + + renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info2) + renamable $r2 = tLDRHi killed renamable $r2, 1, 14, $noreg :: (load 2 from %ir.idx3) + tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr + t2IT 0, 8, implicit-def $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next4) + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.7, 0, killed $cpsr + t2B %bb.8, 14, $noreg + + bb.1.while.cond9.preheader: + successors: %bb.8(0x30000000), %bb.2(0x50000000) + liveins: $r0, $r1 + + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.8, 0, killed $cpsr + + bb.2.land.rhs11.lr.ph: + successors: %bb.3(0x80000000) + liveins: $r0, $r1 + + renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14, $noreg :: (load 2 from %ir.data16143) + + bb.3.land.rhs11: + successors: %bb.9(0x04000000), %bb.4(0x7c000000) + liveins: $r0, $r1 + + renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info12) + renamable $r2 = tLDRBi killed renamable $r2, 0, 14, $noreg :: (load 1 from %ir.data165, align 2) + tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.9, 0, killed $cpsr + + bb.4.while.body19: + successors: %bb.8(0x04000000), %bb.3(0x7c000000) + liveins: $r0, $r1 + + renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next206) + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.3, 1, killed $cpsr + + bb.8: + successors: %bb.9(0x80000000) + + renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg + + bb.9.return: + liveins: $r0 + + tBX_RET 14, $noreg, implicit killed $r0 + +... Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-le-simple.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-le-simple.ll +++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-le-simple.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main %s -o - | FileCheck %s + +define void @cbz_exit(i32* %in, i32* %res) { +; CHECK-LABEL: cbz_exit: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r2, r0, #4 +; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: .LBB0_1: @ %loop +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r3, [r2, #4]! +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: cbz r3, .LBB0_2 +; CHECK-NEXT: le .LBB0_1 +; CHECK-NEXT: .LBB0_2: @ %exit +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: bx lr +entry: + br label %loop + +loop: + %offset = phi i32 [ 0, %entry ], [ %next, %loop ] + %ptr = getelementptr i32, i32* %in, i32 %offset + %val = load i32, i32* %ptr + %next = add i32 %offset, 1 + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %exit, label %loop + +exit: + store i32 %offset, i32* %res + ret void +} + +define void @cbnz_exit(i32* %in, i32* %res) { +; CHECK-LABEL: cbnz_exit: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r2, r0, #4 +; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: .LBB1_1: @ %loop +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r3, [r2, #4]! +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: cbnz r3, .LBB1_2 +; CHECK-NEXT: le .LBB1_1 +; CHECK-NEXT: .LBB1_2: @ %exit +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: bx lr +entry: + br label %loop + +loop: + %offset = phi i32 [ 0, %entry ], [ %next, %loop ] + %ptr = getelementptr i32, i32* %in, i32 %offset + %val = load i32, i32* %ptr + %next = add i32 %offset, 1 + %cmp = icmp ne i32 %val, 0 + br i1 %cmp, label %exit, label %loop + +exit: + store i32 %offset, i32* %res + ret void +} + +define void @cbnz_exit_too_large(i32* %in, i32* %res) { +; CHECK-LABEL: cbnz_exit_too_large: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r2, r0, #4 +; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: .LBB2_1: @ %loop +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r12, [r2, #4]! +; CHECK-NEXT: .zero 4090 +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: beq.w .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: bx lr +entry: + br label %loop + +loop: + %offset = phi i32 [ 0, %entry ], [ %next, %loop ] + %ptr = getelementptr i32, i32* %in, i32 %offset + %val = load i32, i32* %ptr + %next = add i32 %offset, 1 + %cmp = icmp ne i32 %val, 0 + %size = call i32 @llvm.arm.space(i32 4090, i32 undef) + br i1 %cmp, label %exit, label %loop + +exit: + store i32 %offset, i32* %res + ret void +} + +define void @cbz_exit_minsize(i32* %in, i32* %res) #0 { +; CHECK-LABEL: cbz_exit_minsize: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: .LBB3_1: @ %loop +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr.w r3, [r0, r2, lsl #2] +; CHECK-NEXT: adds r2, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: bne .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: subs r0, r2, #1 +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: bx lr +entry: + br label %loop + +loop: + %offset = phi i32 [ 0, %entry ], [ %next, %loop ] + %ptr = getelementptr i32, i32* %in, i32 %offset + %val = load i32, i32* %ptr + %next = add i32 %offset, 1 + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %exit, label %loop + +exit: + store i32 %offset, i32* %res + ret void +} + +define void @cbnz_exit_minsize(i32* %in, i32* %res) #0 { +; CHECK-LABEL: cbnz_exit_minsize: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: .LBB4_1: @ %loop +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr.w r3, [r0, r2, lsl #2] +; CHECK-NEXT: adds r2, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: beq .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %exit +; CHECK-NEXT: subs r0, r2, #1 +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: bx lr +entry: + br label %loop + +loop: + %offset = phi i32 [ 0, %entry ], [ %next, %loop ] + %ptr = getelementptr i32, i32* %in, i32 %offset + %val = load i32, i32* %ptr + %next = add i32 %offset, 1 + %cmp = icmp ne i32 %val, 0 + br i1 %cmp, label %exit, label %loop + +exit: + store i32 %offset, i32* %res + ret void +} + +attributes #0 = { minsize optsize } + +declare i32 @llvm.arm.space(i32 immarg, i32); Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-reorder.mir =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-reorder.mir +++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec-reorder.mir @@ -0,0 +1,184 @@ +# RUN: llc -mtriple=thumbv8.1m.main %s -run-pass=arm-cp-islands --verify-machineinstrs -o - | FileCheck %s +# CHECK-NOT: t2LE + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-unknown-unknown" + + %struct.head_s = type { %struct.head_s*, %struct.data_s* } + %struct.data_s = type { i16, i16 } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcscc %struct.head_s* @search(%struct.head_s* readonly %list, %struct.data_s* nocapture readonly %info) local_unnamed_addr #0 { + entry: + %idx = getelementptr inbounds %struct.data_s, %struct.data_s* %info, i32 0, i32 1 + %tmp = load i16, i16* %idx, align 2 + %cmp = icmp sgt i16 %tmp, -1 + br i1 %cmp, label %while.cond.preheader, label %while.cond9.preheader + + while.cond9.preheader: ; preds = %entry + %0 = icmp eq %struct.head_s* %list, null + br i1 %0, label %return, label %land.rhs11.lr.ph + + land.rhs11.lr.ph: ; preds = %while.cond9.preheader + %data16143 = bitcast %struct.data_s* %info to i16* + %tmp1 = load i16, i16* %data16143, align 2 + %conv15 = sext i16 %tmp1 to i32 + br label %land.rhs11 + + while.cond.preheader: ; preds = %entry + %1 = icmp eq %struct.head_s* %list, null + br i1 %1, label %return, label %land.rhs.preheader + + land.rhs.preheader: ; preds = %while.cond.preheader + br label %land.rhs + + while.body: ; preds = %land.rhs + %next4 = bitcast %struct.head_s* %list.addr.033 to %struct.head_s** + %tmp4 = load %struct.head_s*, %struct.head_s** %next4, align 4 + %tobool = icmp eq %struct.head_s* %tmp4, null + br i1 %tobool, label %return, label %land.rhs + + land.rhs: ; preds = %land.rhs.preheader, %while.body + %list.addr.033 = phi %struct.head_s* [ %tmp4, %while.body ], [ %list, %land.rhs.preheader ] + %info2 = getelementptr inbounds %struct.head_s, %struct.head_s* %list.addr.033, i32 0, i32 1 + %tmp2 = load %struct.data_s*, %struct.data_s** %info2, align 4 + %idx3 = getelementptr inbounds %struct.data_s, %struct.data_s* %tmp2, i32 0, i32 1 + %tmp3 = load i16, i16* %idx3, align 2 + %cmp7 = icmp eq i16 %tmp3, %tmp + br i1 %cmp7, label %return, label %while.body + + while.body19: ; preds = %land.rhs11 + %next205 = bitcast %struct.head_s* %list.addr.136 to %struct.head_s** + %tmp8 = load %struct.head_s*, %struct.head_s** %next205, align 4 + %tobool10 = icmp eq %struct.head_s* %tmp8, null + br i1 %tobool10, label %return, label %land.rhs11 + + land.rhs11: ; preds = %while.body19, %land.rhs11.lr.ph + %list.addr.136 = phi %struct.head_s* [ %list, %land.rhs11.lr.ph ], [ %tmp8, %while.body19 ] + %info12 = getelementptr inbounds %struct.head_s, %struct.head_s* %list.addr.136, i32 0, i32 1 + %tmp5 = load %struct.data_s*, %struct.data_s** %info12, align 4 + %data166 = bitcast %struct.data_s* %tmp5 to i16* + %tmp6 = load i16, i16* %data166, align 2 + %2 = and i16 %tmp6, 255 + %and = zext i16 %2 to i32 + %cmp16 = icmp eq i32 %and, %conv15 + br i1 %cmp16, label %return, label %while.body19 + + return: ; preds = %land.rhs11, %while.body19, %land.rhs, %while.body, %while.cond.preheader, %while.cond9.preheader + %retval.0 = phi %struct.head_s* [ null, %while.cond.preheader ], [ null, %while.cond9.preheader ], [ %list.addr.033, %land.rhs ], [ null, %while.body ], [ %list.addr.136, %land.rhs11 ], [ null, %while.body19 ] + ret %struct.head_s* %retval.0 + } + + attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+lob,+ras,+soft-float,+strict-align,+thumb-mode,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8d16,-fp-armv8d16sp,-fp-armv8sp,-fp16,-fp16fml,-fp64,-fpregs,-fullfp16,-neon,-vfp2,-vfp2d16,-vfp2d16sp,-vfp2sp,-vfp3,-vfp3d16,-vfp3d16sp,-vfp3sp,-vfp4,-vfp4d16,-vfp4d16sp,-vfp4sp" "unsafe-fp-math"="false" "use-soft-float"="true" } + +... +--- +name: search +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.3(0x50000000), %bb.1(0x30000000) + liveins: $r0, $r1 + + renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14, $noreg :: (load 2 from %ir.idx) + t2CMPri renamable $r2, -1, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 13, killed $cpsr + + bb.3.while.cond.preheader: + successors: %bb.4(0x80000000) + liveins: $r0, $r2 + + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + renamable $r1 = tUXTH killed renamable $r2, 14, $noreg + + bb.4.land.rhs: + successors: %bb.6(0x04000000), %bb.5(0x7c000000) + liveins: $r0, $r1 + + renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info2) + renamable $r2 = tLDRHi killed renamable $r2, 1, 14, $noreg :: (load 2 from %ir.idx3) + tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.6, 0, killed $cpsr + + bb.5.while.body: + successors: %bb.4(0x7c000000) + liveins: $r0, $r1 + + renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next4) + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + t2B %bb.4, 14, $noreg + + bb.6.return: + liveins: $r0 + + tBX_RET 14, $noreg, implicit $r0 + + bb.1.while.cond9.preheader: + successors: %bb.2(0x80000000) + liveins: $r0, $r1 + + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14, $noreg :: (load 2 from %ir.data16143) + + bb.2.land.rhs11: + successors: %bb.2(0x7c000000) + liveins: $r0, $r1 + + renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info12) + renamable $r2 = tLDRBi killed renamable $r2, 0, 14, $noreg :: (load 1 from %ir.data166, align 2) + tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr + t2IT 0, 8, implicit-def $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next205) + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + t2B %bb.2, 14, $noreg + +... Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec.mir =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec.mir +++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/no-dec.mir @@ -0,0 +1,201 @@ +# RUN: llc -mtriple=thumbv8.1m.main %s -run-pass=arm-cp-islands --verify-machineinstrs -o - | FileCheck %s --check-prefix=CHECK-LOB +# RUN: llc -mtriple=thumbv8.1m.main -mattr=-lob %s -run-pass=arm-cp-islands --verify-machineinstrs -o - | FileCheck %s --check-prefix=CHECK-NOLOB + +# CHECK-NOLOB-NOT: t2LE + +# CHECK-LOB: bb.3.land.rhs: +# CHECK-LOB: tCBZ $r0, %bb.8 +# CHECK-LOB: t2LE %bb.3 +# CHECK-LOB: bb.6.land.rhs11: +# CHECK-LOB: bb.7.while.body19: +# CHECK-LOB: tCBZ $r0, %bb.8 +# CHECK-LOB: t2LE %bb.6 +# CHECK-LOB: bb.8: + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-unknown-unknown" + + %struct.head_s = type { %struct.head_s*, %struct.data_s* } + %struct.data_s = type { i16, i16 } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcscc %struct.head_s* @search(%struct.head_s* readonly %list, %struct.data_s* nocapture readonly %info) local_unnamed_addr #0 { + entry: + %idx = getelementptr inbounds %struct.data_s, %struct.data_s* %info, i32 0, i32 1 + %0 = load i16, i16* %idx, align 2 + %cmp = icmp sgt i16 %0, -1 + br i1 %cmp, label %while.cond.preheader, label %while.cond9.preheader + + while.cond9.preheader: ; preds = %entry + %1 = icmp eq %struct.head_s* %list, null + br i1 %1, label %return, label %land.rhs11.lr.ph + + land.rhs11.lr.ph: ; preds = %while.cond9.preheader + %data16143 = bitcast %struct.data_s* %info to i16* + %2 = load i16, i16* %data16143, align 2 + %conv15 = sext i16 %2 to i32 + br label %land.rhs11 + + while.cond.preheader: ; preds = %entry + %3 = icmp eq %struct.head_s* %list, null + br i1 %3, label %return, label %land.rhs.preheader + + land.rhs.preheader: ; preds = %while.cond.preheader + br label %land.rhs + + land.rhs: ; preds = %land.rhs.preheader, %while.body + %list.addr.033 = phi %struct.head_s* [ %6, %while.body ], [ %list, %land.rhs.preheader ] + %info2 = getelementptr inbounds %struct.head_s, %struct.head_s* %list.addr.033, i32 0, i32 1 + %4 = load %struct.data_s*, %struct.data_s** %info2, align 4 + %idx3 = getelementptr inbounds %struct.data_s, %struct.data_s* %4, i32 0, i32 1 + %5 = load i16, i16* %idx3, align 2 + %cmp7 = icmp eq i16 %5, %0 + br i1 %cmp7, label %return, label %while.body + + while.body: ; preds = %land.rhs + %next4 = bitcast %struct.head_s* %list.addr.033 to %struct.head_s** + %6 = load %struct.head_s*, %struct.head_s** %next4, align 4 + %tobool = icmp eq %struct.head_s* %6, null + br i1 %tobool, label %return, label %land.rhs + + land.rhs11: ; preds = %while.body19, %land.rhs11.lr.ph + %list.addr.136 = phi %struct.head_s* [ %list, %land.rhs11.lr.ph ], [ %10, %while.body19 ] + %info12 = getelementptr inbounds %struct.head_s, %struct.head_s* %list.addr.136, i32 0, i32 1 + %7 = load %struct.data_s*, %struct.data_s** %info12, align 4 + %data165 = bitcast %struct.data_s* %7 to i16* + %8 = load i16, i16* %data165, align 2 + %9 = and i16 %8, 255 + %and = zext i16 %9 to i32 + %cmp16 = icmp eq i32 %and, %conv15 + br i1 %cmp16, label %return, label %while.body19 + + while.body19: ; preds = %land.rhs11 + %next206 = bitcast %struct.head_s* %list.addr.136 to %struct.head_s** + %10 = load %struct.head_s*, %struct.head_s** %next206, align 4 + %tobool10 = icmp eq %struct.head_s* %10, null + br i1 %tobool10, label %return, label %land.rhs11 + + return: ; preds = %while.body19, %land.rhs11, %while.body, %land.rhs, %while.cond.preheader, %while.cond9.preheader + %retval.0 = phi %struct.head_s* [ null, %while.cond.preheader ], [ null, %while.cond9.preheader ], [ %list.addr.033, %land.rhs ], [ null, %while.body ], [ %list.addr.136, %land.rhs11 ], [ null, %while.body19 ] + ret %struct.head_s* %retval.0 + } + + attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+hwdiv,+lob,+ras,+soft-float,+strict-align,+thumb-mode,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8d16,-fp-armv8d16sp,-fp-armv8sp,-fp16,-fp16fml,-fp64,-fpregs,-fullfp16,-neon,-vfp2,-vfp2d16,-vfp2d16sp,-vfp2sp,-vfp3,-vfp3d16,-vfp3d16sp,-vfp3sp,-vfp4,-vfp4d16,-vfp4d16sp,-vfp4sp" "unsafe-fp-math"="false" "use-soft-float"="true" } + +... +--- +name: search +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.5(0x50000000), %bb.1(0x30000000) + liveins: $r0, $r1 + + renamable $r2 = t2LDRSHi12 renamable $r1, 2, 14, $noreg :: (load 2 from %ir.idx) + t2CMPri renamable $r2, -1, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 13, killed $cpsr + + bb.5.while.cond.preheader: + successors: %bb.8(0x30000000), %bb.6(0x50000000) + liveins: $r0, $r2 + + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.8, 0, killed $cpsr + + bb.6.land.rhs.preheader: + successors: %bb.7(0x80000000) + liveins: $r0, $r2 + + renamable $r1 = tUXTH killed renamable $r2, 14, $noreg + + bb.7.land.rhs: + successors: %bb.8(0x04000000), %bb.7(0x7c000000) + liveins: $r0, $r1 + + renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info2) + renamable $r2 = tLDRHi killed renamable $r2, 1, 14, $noreg :: (load 2 from %ir.idx3) + tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr + t2IT 0, 8, implicit-def $itstate + tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate + renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next4) + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.7, 1, killed $cpsr + t2B %bb.8, 14, $noreg + + bb.1.while.cond9.preheader: + successors: %bb.8(0x30000000), %bb.2(0x50000000) + liveins: $r0, $r1 + + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.8, 0, killed $cpsr + + bb.2.land.rhs11.lr.ph: + successors: %bb.3(0x80000000) + liveins: $r0, $r1 + + renamable $r1 = t2LDRSHi12 killed renamable $r1, 0, 14, $noreg :: (load 2 from %ir.data16143) + + bb.3.land.rhs11: + successors: %bb.9(0x04000000), %bb.4(0x7c000000) + liveins: $r0, $r1 + + renamable $r2 = tLDRi renamable $r0, 1, 14, $noreg :: (load 4 from %ir.info12) + renamable $r2 = tLDRBi killed renamable $r2, 0, 14, $noreg :: (load 1 from %ir.data165, align 2) + tCMPr killed renamable $r2, renamable $r1, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.9, 0, killed $cpsr + + bb.4.while.body19: + successors: %bb.8(0x04000000), %bb.3(0x7c000000) + liveins: $r0, $r1 + + renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (load 4 from %ir.next206) + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.3, 1, killed $cpsr + + bb.8: + successors: %bb.9(0x80000000) + + renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg + + bb.9.return: + liveins: $r0 + + tBX_RET 14, $noreg, implicit killed $r0 + +...