Index: llvm/trunk/lib/Target/ARM/ARM.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARM.td +++ llvm/trunk/lib/Target/ARM/ARM.td @@ -222,6 +222,13 @@ def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true", "Has return address stack">; +// Some processors have no branch predictor, which changes the expected cost of +// taking a branch which affects the choice of whether to use predicated +// instructions. +def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor", + "HasBranchPredictor", "false", + "Has no branch predictor">; + /// DSP extension. def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Supports DSP instructions in ARM and/or Thumb2">; @@ -756,13 +763,19 @@ FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>; -def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>; +def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m, + ProcM3, + FeatureHasNoBranchPredictor]>; + +def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m, + ProcM3, + FeatureHasNoBranchPredictor]>; -def : ProcNoItin<"cortex-m4", [ARMv7em, +def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em, FeatureVFP4, FeatureVFPOnlySP, - FeatureD16]>; + FeatureD16, + FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-m7", [ARMv7em, FeatureFPARMv8, @@ -771,11 +784,12 @@ def : ProcNoItin<"cortex-m23", [ARMv8mBaseline, FeatureNoMovt]>; -def : ProcNoItin<"cortex-m33", [ARMv8mMainline, +def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline, FeatureDSP, FeatureFPARMv8, FeatureD16, - FeatureVFPOnlySP]>; + FeatureVFPOnlySP, + FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, Index: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1851,9 
+1851,9 @@ } bool ARMBaseInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &, +isProfitableToIfCvt(MachineBasicBlock &TBB, unsigned TCycles, unsigned TExtra, - MachineBasicBlock &, + MachineBasicBlock &FBB, unsigned FCycles, unsigned FExtra, BranchProbability Probability) const { if (!TCycles) @@ -1863,14 +1863,43 @@ // Here we scale up each component of UnpredCost to avoid precision issue when // scaling TCycles/FCycles by Probability. const unsigned ScalingUpFactor = 1024; - unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); - unsigned FUnpredCost = + + unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor; + unsigned UnpredCost; + if (!Subtarget.hasBranchPredictor()) { + // When we don't have a branch predictor it's always cheaper to not take a + // branch than take it, so we have to take that into account. + unsigned NotTakenBranchCost = 1; + unsigned TakenBranchCost = Subtarget.getMispredictionPenalty(); + unsigned TUnpredCycles, FUnpredCycles; + if (!FCycles) { + // Triangle: TBB is the fallthrough + TUnpredCycles = TCycles + NotTakenBranchCost; + FUnpredCycles = TakenBranchCost; + } else { + // Diamond: TBB is the block that is branched to, FBB is the fallthrough + TUnpredCycles = TCycles + TakenBranchCost; + FUnpredCycles = FCycles + NotTakenBranchCost; + } + // The total cost is the cost of each path scaled by their probabilities + unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor); + unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor); + UnpredCost = TUnpredCost + FUnpredCost; + // When predicating assume that the first IT can be folded away but later + // ones cost one cycle each + if (Subtarget.isThumb2() && TCycles + FCycles > 4) { + PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor; + } + } else { + unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); + unsigned FUnpredCost = Probability.getCompl().scale(FCycles * ScalingUpFactor);
- unsigned UnpredCost = TUnpredCost + FUnpredCost; - UnpredCost += 1 * ScalingUpFactor; // The branch itself - UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + UnpredCost = TUnpredCost + FUnpredCost; + UnpredCost += 1 * ScalingUpFactor; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + } - return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost; + return PredCost <= UnpredCost; } bool Index: llvm/trunk/lib/Target/ARM/ARMSchedule.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMSchedule.td +++ llvm/trunk/lib/Target/ARM/ARMSchedule.td @@ -424,3 +424,4 @@ include "ARMScheduleSwift.td" include "ARMScheduleR52.td" include "ARMScheduleA57.td" +include "ARMScheduleM3.td" Index: llvm/trunk/lib/Target/ARM/ARMScheduleM3.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMScheduleM3.td +++ llvm/trunk/lib/Target/ARM/ARMScheduleM3.td @@ -0,0 +1,21 @@ +//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the ARM Cortex-M3 processor. 
+// +//===----------------------------------------------------------------------===// + +def CortexM3Model : SchedMachineModel { + let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue + let MicroOpBufferSize = 0; // In-order + let LoadLatency = 2; // Latency when not pipelined, not pc-relative + let MispredictPenalty = 2; // Best case branch taken cost + + let CompleteModel = 0; +} Index: llvm/trunk/lib/Target/ARM/ARMSubtarget.h =================================================================== --- llvm/trunk/lib/Target/ARM/ARMSubtarget.h +++ llvm/trunk/lib/Target/ARM/ARMSubtarget.h @@ -246,6 +246,11 @@ /// avoid issue "normal" call instructions to callees which do not return. bool HasRetAddrStack = false; + /// HasBranchPredictor - True if the subtarget has a branch predictor. Having + /// a branch predictor or not changes the expected cost of taking a branch + /// which affects the choice of whether to use predicated instructions. + bool HasBranchPredictor = true; + /// HasMPExtension - True if the subtarget supports Multiprocessing /// extension (ARMv7 only). 
bool HasMPExtension = false; @@ -554,6 +559,7 @@ bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; } bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } bool hasRetAddrStack() const { return HasRetAddrStack; } + bool hasBranchPredictor() const { return HasBranchPredictor; } bool hasMPExtension() const { return HasMPExtension; } bool hasDSP() const { return HasDSP; } bool useNaClTrap() const { return UseNaClTrap; } Index: llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll +++ llvm/trunk/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll @@ -0,0 +1,154 @@ +; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m7 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BP +; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m3 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOBP + +declare void @otherfn() + +; CHECK-LABEL: triangle1: +; CHECK: itt ne +; CHECK: movne +; CHECK: strne +define i32 @triangle1(i32 %n, i32* %p) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: + store i32 1, i32* %p, align 4 + br label %if.end + +if.end: + tail call void @otherfn() + ret i32 0 +} + +; CHECK-LABEL: triangle2: +; CHECK-BP: itttt ne +; CHECK-BP: movne +; CHECK-BP: strne +; CHECK-BP: movne +; CHECK-BP: strne +; CHECK-NOBP: cbz +; CHECK-NOBP: movs +; CHECK-NOBP: str +; CHECK-NOBP: movs +; CHECK-NOBP: str +define i32 @triangle2(i32 %n, i32* %p, i32* %q) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: + store i32 1, i32* %p, align 4 + store i32 2, i32* %q, align 4 + br label %if.end + +if.end: + tail call void @otherfn() + ret i32 0 +} + +; CHECK-LABEL: triangle3: +; CHECK: cbz +; CHECK: movs +; CHECK: str +; CHECK: movs +; CHECK: str +; CHECK: movs +; CHECK: str +define i32 @triangle3(i32 %n, i32* %p, i32* %q, 
i32* %r) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: + store i32 1, i32* %p, align 4 + store i32 2, i32* %q, align 4 + store i32 3, i32* %r, align 4 + br label %if.end + +if.end: + tail call void @otherfn() + ret i32 0 +} + +; CHECK-LABEL: diamond1: +; CHECK: ite eq +; CHECK: ldreq +; CHECK: strne +define i32 @diamond1(i32 %n, i32* %p) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: + store i32 %n, i32* %p, align 4 + br label %if.end + +if.else: + %0 = load i32, i32* %p, align 4 + br label %if.end + +if.end: + %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ] + tail call void @otherfn() + ret i32 %n.addr.0 +} + +; CHECK-LABEL: diamond2: +; CHECK-BP: itte +; CHECK-BP: streq +; CHECK-BP: ldreq +; CHECK-BP: strne +; CHECK-NOBP: cbz +; CHECK-NOBP: str +; CHECK-NOBP: b +; CHECK-NOBP: str +; CHECK-NOBP: ldr +define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: + store i32 %n, i32* %p, align 4 + br label %if.end + +if.else: + store i32 %m, i32* %q, align 4 + %0 = load i32, i32* %p, align 4 + br label %if.end + +if.end: + %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ] + tail call void @otherfn() + ret i32 %n.addr.0 +} + +; CHECK-LABEL: diamond3: +; CHECK: cbz +; CHECK: movs +; CHECK: str +; CHECK: b +; CHECK: ldr +; CHECK: ldr +; CHECK: adds +define i32 @diamond3(i32 %n, i32* %p, i32* %q) { +entry: + %tobool = icmp eq i32 %n, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: + store i32 1, i32* %p, align 4 + br label %if.end + +if.else: + %0 = load i32, i32* %p, align 4 + %1 = load i32, i32* %q, align 4 + %add = add nsw i32 %1, %0 + br label %if.end + +if.end: + %n.addr.0 = phi i32 [ %n, %if.then ], [ %add, %if.else ] + tail call void @otherfn() + ret i32 %n.addr.0 +}