Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -646,6 +646,9 @@
   /// \brief Additional properties of an operand's values.
   enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
 
+  /// \return True if target can execute instructions out of order.
+  bool isOutOfOrder() const;
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
@@ -1018,6 +1021,7 @@
                             Type *Ty) = 0;
   virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                             Type *Ty) = 0;
+  virtual bool isOutOfOrder() const = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
@@ -1295,6 +1299,9 @@
                     Type *Ty) override {
     return Impl.getIntImmCost(IID, Idx, Imm, Ty);
   }
+  bool isOutOfOrder() const override { // Forwards to the wrapped Impl.
+    return Impl.isOutOfOrder();
+  }
   unsigned getNumberOfRegisters(bool Vector) override {
     return Impl.getNumberOfRegisters(Vector);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -337,6 +337,8 @@
     return TTI::TCC_Free;
   }
 
+  bool isOutOfOrder() const { return false; }
+
   unsigned getNumberOfRegisters(bool Vector) { return 8; }
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -398,6 +398,10 @@
     return BaseT::getInstructionLatency(I);
   }
 
+  bool isOutOfOrder() const { // Answered by the subtarget's sched model.
+    return getST()->getSchedModel().isOutOfOrder();
+  }
+
   /// @}
 
   /// \name Vector TTI Implementations
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -314,6 +314,10 @@
   return Cost;
 }
 
+bool TargetTransformInfo::isOutOfOrder() const {
+  return TTIImpl->isOutOfOrder();
+}
+
 unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
   return TTIImpl->getNumberOfRegisters(Vector);
 }
Index: lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyCFG.cpp
+++ lib/Transforms/Utils/SimplifyCFG.cpp
@@ -127,6 +127,16 @@
     cl::desc("Limit maximum recursion depth when calculating costs of "
              "speculatively executed instructions"));
 
+static cl::opt<unsigned> DependenceChainLatency(
+    "dependence-chain-latency", cl::Hidden, cl::init(8),
+    cl::desc("Limit the maximum latency of dependence chain containing cmp "
+             "for if conversion"));
+
+static cl::opt<unsigned> SmallBBSize(
+    "small-bb-size", cl::Hidden, cl::init(40),
+    cl::desc("Check dependence chain latency only in basic block smaller than "
+             "this number"));
+
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLinearMaps,
           "Number of switch instructions turned into linear mapping");
@@ -395,6 +405,166 @@
   return true;
 }
 
+/// Sum the TCK_CodeSize cost of BB's instructions, excluding its terminator.
+static unsigned CountBBCodeSize(BasicBlock *BB,
+                                const TargetTransformInfo &TTI) {
+  unsigned Size = 0;
+  for (auto II = BB->begin(); !isa<TerminatorInst>(II); ++II)
+    Size += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_CodeSize);
+  return Size;
+}
+
+/// Find out the latency of the longest dependence chain in the BB if
+/// LongestChain is true, or the dependence chain containing the compare
+/// instruction feeding the block's conditional branch.
+static unsigned FindDependenceChainLatency(
+    BasicBlock *BB, DenseMap<Instruction *, unsigned> &Instructions,
+    const TargetTransformInfo &TTI, bool LongestChain) {
+  unsigned MaxLatency = 0;
+
+  // Accumulated latency of an instruction = its own latency + the largest
+  // accumulated latency among its operands already recorded in Instructions.
+  BasicBlock::iterator II;
+  for (II = BB->begin(); !isa<TerminatorInst>(II); ++II) {
+    unsigned Latency = 0;
+    for (unsigned O = 0, E = II->getNumOperands(); O != E; ++O) {
+      Instruction *Op = dyn_cast<Instruction>(II->getOperand(O));
+      if (Op && Instructions.count(Op)) {
+        auto OpLatency = Instructions[Op];
+        if (OpLatency > Latency)
+          Latency = OpLatency;
+      }
+    }
+    Latency += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_Latency);
+    Instructions[&(*II)] = Latency;
+
+    if (Latency > MaxLatency)
+      MaxLatency = Latency;
+  }
+
+  if (LongestChain)
+    return MaxLatency;
+
+  // The length of the dependence chain containing the compare instruction is
+  // wanted, so the terminator must be a BranchInst.
+  assert(isa<BranchInst>(II));
+  BranchInst *Br = cast<BranchInst>(II);
+  Instruction *Cmp = dyn_cast<Instruction>(Br->getCondition());
+  if (Cmp && Instructions.count(Cmp))
+    return Instructions[Cmp];
+  return 0;
+}
+
+/// Instructions in BB2 may depend on instructions in BB1, directly or through
+/// IfBlock1/IfBlock2. If the last (in terms of latency) such instruction in
+/// BB1 is I, then the instructions after I can be executed in parallel with
+/// instructions in BB2.
+/// This function returns the latency of I, or 0 if there is no such I.
+static unsigned LatencyAdjustment(
+    BasicBlock *BB1, BasicBlock *BB2, BasicBlock *IfBlock1,
+    BasicBlock *IfBlock2, DenseMap<Instruction *, unsigned> &BB1Instructions) {
+  unsigned LastLatency = 0;
+  SmallVector<Instruction *, 16> Worklist;
+  BasicBlock::iterator II;
+  for (II = BB2->begin(); !isa<TerminatorInst>(II); ++II) {
+    if (PHINode *PN = dyn_cast<PHINode>(II)) {
+      // Look for users in BB2.
+      bool InBBUser = false;
+      for (User *U : PN->users()) {
+        if (cast<Instruction>(U)->getParent() == BB2) {
+          InBBUser = true;
+          break;
+        }
+      }
+      // No such user: skip just this PHI but keep scanning the rest of BB2.
+      if (!InBBUser)
+        continue;
+    }
+    Worklist.push_back(&(*II));
+  }
+
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) {
+      if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(O))) {
+        if (Op->getParent() == IfBlock1 || Op->getParent() == IfBlock2)
+          Worklist.push_back(Op);
+        else if (Op->getParent() == BB1 && BB1Instructions.count(Op)) {
+          if (BB1Instructions[Op] > LastLatency)
+            LastLatency = BB1Instructions[Op];
+        }
+      }
+    }
+  }
+
+  return LastLatency;
+}
+
+/// If after if conversion, most of the instructions in this new BB construct a
+/// long and slow dependence chain, it may be slower than cmp/branch, even
+/// if the branch has a high miss rate, because the control dependence is
+/// transformed into data dependence, and control dependence can be speculated,
+/// and thus, the second part can execute in parallel with the first part on
+/// modern OOO processor.
+///
+/// To check this condition, this function finds the length of the dependence
+/// chain in BB1 (only the part that can be executed in parallel with code after
+/// branch in BB2) containing cmp. Returns true, meaning if conversion should
+/// not be performed, when that length reaches DependenceChainLatency.
+///
+/// BB1, BB2, IfBlock1 and IfBlock2 are candidate BBs for if conversion.
+/// SpeculationSize contains the code size of IfBlock1 and IfBlock2.
+static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2,
+                                    BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+                                    unsigned SpeculationSize,
+                                    const TargetTransformInfo &TTI) {
+  if (!TTI.isOutOfOrder())
+    return false;
+
+  unsigned NewBBSize =
+      CountBBCodeSize(BB1, TTI) + CountBBCodeSize(BB2, TTI) + SpeculationSize;
+
+  // We check small BB only since it is more difficult to find unrelated
+  // instructions to fill functional units in a small BB.
+  if (NewBBSize > SmallBBSize)
+    return false;
+
+  // Accumulated latency of each instruction in their BBs.
+  DenseMap<Instruction *, unsigned> BB1Instructions;
+  DenseMap<Instruction *, unsigned> BB2Instructions;
+
+  unsigned BB1Chain =
+      FindDependenceChainLatency(BB1, BB1Instructions, TTI, false);
+  unsigned BB2Chain =
+      FindDependenceChainLatency(BB2, BB2Instructions, TTI, true);
+
+  // If there are many unrelated instructions in the new BB, there will be
+  // other instructions for the processor to issue regardless of the length
+  // of this new dependence chain.
+  // Modern processors can issue 3 or more instructions in each cycle. But in
+  // real world applications, an IPC of 2 is already very good for non-loop
+  // code with small basic blocks. Higher IPC is usually found in programs with
+  // small kernel. So IPC of 2 is more reasonable for most applications.
+  if ((BB1Chain + BB2Chain) * 2 <= NewBBSize)
+    return false;
+
+  // We only care about part of the dependence chain in BB1 that can be
+  // executed in parallel with BB2, so adjust the latency. The adjustment may
+  // exceed the cmp chain (e.g. the condition is not computed in BB1, so the
+  // chain is 0); clamp at 0 rather than letting the unsigned value wrap.
+  unsigned Adjust =
+      LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1Instructions);
+  BB1Chain = BB1Chain > Adjust ? BB1Chain - Adjust : 0;
+
+  // Correctly predicted branch instruction can skip the dependence chain in
+  // BB1, but misprediction has a penalty, so only when the dependence chain is
+  // at least DependenceChainLatency long is branch better than select.
+  // Besides misprediction penalty, the threshold value DependenceChainLatency
+  // also depends on branch misprediction rate, taken branch latency and cmov
+  // latency.
+  return BB1Chain >= DependenceChainLatency;
+}
+
 /// Extract ConstantInt from value, looking through IntToPtr
 /// and PointerNullValue. Return NULL if value is not a constant int.
 static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
@@ -2044,6 +2214,11 @@
   if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))
     return false;
 
+  // Don't do if conversion for long dependence chain.
+  if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr,
+                              CountBBCodeSize(ThenBB, TTI), TTI))
+    return false;
+
   // If we get here, we can hoist the instruction and if-convert.
   DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
 
@@ -2351,6 +2526,10 @@
     }
   }
 
+  if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2,
+                              AggressiveInsts.size(), TTI))
+    return false; // NOTE(review): size() counts insts, not TCK_CodeSize cost.
+
   DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: "
                << IfTrue->getName() << " F: " << IfFalse->getName() << "\n");
 
Index: test/Transforms/SimplifyCFG/X86/if-conversion.ll
===================================================================
--- test/Transforms/SimplifyCFG/X86/if-conversion.ll
+++ test/Transforms/SimplifyCFG/X86/if-conversion.ll
@@ -0,0 +1,231 @@
+; RUN: opt < %s -simplifycfg -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -S | FileCheck %s
+; Avoid if-conversion if there is a long dependence chain.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; The first several cases test that FindLongDependenceChain returns true, so
+; if-conversion is blocked.
+
+define i64 @test1(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8        ; address comes from the first load
+  %cmp = icmp slt i64 %1, 0              ; condition depends on both loads
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+; CHECK-LABEL: @test1
+; CHECK-NOT: select
+}
+
+define i64 @test2(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = add i64 %pint, 16                ; add here, where test1 uses or
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @test2
+; CHECK-NOT: select
+}
+
+; The following cases test FindLongDependenceChain returns false, so
+; if-conversion will proceed.
+
+; Non trivial LatencyAdjustment.
+define i64 @test3(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %p4 = add i64 %p3, %1                  ; cond.end also consumes %1 from entry
+  %ptr = inttoptr i64 %p4 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @test3
+; CHECK: select
+}
+
+; Short dependence chain.
+define i64 @test4(i64* %pp, i64* %p) {
+entry:
+  %0 = load i64, i64* %pp, align 8       ; a single load feeds the compare
+  %cmp = icmp slt i64 %0, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @test4
+; CHECK: select
+}
+
+; High IPC.
+define i64 @test5(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  %2 = add i64 %pint, 2                  ; %2..%21: four chains rooted at %pint,
+  %3 = add i64 %pint, 3                  ; none of which feeds %cmp
+  %4 = or i64 %pint, 16
+  %5 = and i64 %pint, 255
+
+  %6 = or i64 %2, 9
+  %7 = and i64 %3, 255
+  %8 = add i64 %4, 4
+  %9 = add i64 %5, 5
+
+  %10 = add i64 %6, 2
+  %11 = add i64 %7, 3
+  %12 = add i64 %8, 4
+  %13 = add i64 %9, 5
+
+  %14 = add i64 %10, 6
+  %15 = add i64 %11, 7
+  %16 = add i64 %12, 8
+  %17 = add i64 %13, 9
+
+  %18 = add i64 %14, 10
+  %19 = add i64 %15, 11
+  %20 = add i64 %16, 12
+  %21 = add i64 %17, 13
+
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+
+  ret i64 %val
+
+; CHECK-LABEL: @test5
+; CHECK: select
+}
+
+; Large BB size.
+define i64 @test6(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  %2 = add i64 %pint, 2                  ; %2..%33: two chains rooted at %pint,
+  %3 = add i64 %pint, 3                  ; joined by %34 below
+  %4 = add i64 %2, 4
+  %5 = add i64 %3, 5
+  %6 = add i64 %4, 6
+  %7 = add i64 %5, 7
+  %8 = add i64 %6, 6
+  %9 = add i64 %7, 7
+  %10 = add i64 %8, 6
+  %11 = add i64 %9, 7
+  %12 = add i64 %10, 6
+  %13 = add i64 %11, 7
+  %14 = add i64 %12, 6
+  %15 = add i64 %13, 7
+  %16 = add i64 %14, 6
+  %17 = add i64 %15, 7
+  %18 = add i64 %16, 6
+  %19 = add i64 %17, 7
+  %20 = add i64 %18, 6
+  %21 = add i64 %19, 7
+  %22 = add i64 %20, 6
+  %23 = add i64 %21, 7
+  %24 = add i64 %22, 6
+  %25 = add i64 %23, 7
+  %26 = add i64 %24, 6
+  %27 = add i64 %25, 7
+  %28 = add i64 %26, 6
+  %29 = add i64 %27, 7
+  %30 = add i64 %28, 6
+  %31 = add i64 %29, 7
+  %32 = add i64 %30, 8
+  %33 = add i64 %31, 9
+  %34 = add i64 %32, %33
+  %35 = and i64 %34, 255
+  %res = add i64 %val, %35               ; folds the padding into the result
+
+  ret i64 %res
+
+; CHECK-LABEL: @test6
+; CHECK: select
+}