Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -640,6 +640,9 @@
   /// \brief Additional properties of an operand's values.
   enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
 
+  /// \return True if target can execute instructions out of order.
+  bool isOutOfOrder() const;
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
@@ -1017,6 +1020,7 @@
                             Type *Ty) = 0;
   virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                             Type *Ty) = 0;
+  virtual bool isOutOfOrder() const = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
@@ -1290,6 +1294,9 @@
                     Type *Ty) override {
     return Impl.getIntImmCost(IID, Idx, Imm, Ty);
   }
+  bool isOutOfOrder() const override {
+    return Impl.isOutOfOrder();
+  }
   unsigned getNumberOfRegisters(bool Vector) override {
     return Impl.getNumberOfRegisters(Vector);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -335,6 +335,8 @@
     return TTI::TCC_Free;
   }
 
+  bool isOutOfOrder() const { return false; }
+
   unsigned getNumberOfRegisters(bool Vector) { return 8; }
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -394,6 +394,10 @@
     return BaseT::getInstructionLatency(I);
   }
 
+  bool isOutOfOrder() const {
+    return getST()->getSchedModel().isOutOfOrder();
+  }
+
   /// @}
 
   /// \name Vector TTI Implementations
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -315,6 +315,10 @@
   return Cost;
 }
 
+bool TargetTransformInfo::isOutOfOrder() const {
+  return TTIImpl->isOutOfOrder();
+}
+
 unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
   return TTIImpl->getNumberOfRegisters(Vector);
 }
Index: lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyCFG.cpp
+++ lib/Transforms/Utils/SimplifyCFG.cpp
@@ -127,6 +127,11 @@
     cl::desc("Limit maximum recursion depth when calculating costs of "
              "speculatively executed instructions"));
 
+static cl::opt<unsigned> DependenceChainLatency(
+    "dependence-chain-latency", cl::Hidden, cl::init(8),
+    cl::desc("Limit the maximum latency of dependence chain containing cmp "
+             "for if conversion"));
+
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLinearMaps,
           "Number of switch instructions turned into linear mapping");
@@ -395,6 +400,178 @@
   return true;
 }
 
+/// Estimate the code size of the specified BB.
+///
+/// Sums the TCK_CodeSize cost reported by \p TTI for every instruction in \p BB
+/// that precedes the terminator; the terminator itself is not counted.
+static unsigned CountBBCodeSize(BasicBlock *BB,
+                                const TargetTransformInfo &TTI) {
+  unsigned Size = 0;
+  for (auto II = BB->begin(); !isa<TerminatorInst>(II); ++II)
+    Size += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_CodeSize);
+  return Size;
+}
+
+/// Find out the latency of the longest dependence chain in the BB or the
+/// dependence chain containing the compare instruction.
+///
+/// Walks the non-terminator instructions of \p BB in order and records in
+/// \p Instructions the accumulated latency of each one: its own TCK_Latency
+/// cost plus the largest accumulated latency among its operands that are
+/// already recorded in \p Instructions.
+///
+/// \returns the largest accumulated latency in \p BB if \p LongestChain is
+/// true. Otherwise \p BB must end in a BranchInst, and the result is the
+/// accumulated latency of the instruction computing the branch condition, or
+/// 0 if the condition is not an instruction recorded in \p Instructions.
+static unsigned
+FindDependenceChainLatency(BasicBlock *BB,
+                           DenseMap<Instruction *, unsigned> &Instructions,
+                           const TargetTransformInfo &TTI, bool LongestChain) {
+  unsigned Max_latency = 0;
+
+  BasicBlock::iterator II;
+  for (II = BB->begin(); !isa<TerminatorInst>(II); ++II) {
+    unsigned Latency = 0;
+    for (unsigned O = 0, E = II->getNumOperands(); O != E; ++O) {
+      Instruction *Op = dyn_cast<Instruction>(II->getOperand(O));
+      if (Op && Instructions.count(Op)) {
+        auto Op_latency = Instructions[Op];
+        if (Op_latency > Latency)
+          Latency = Op_latency;
+      }
+    }
+    Latency += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_Latency);
+    Instructions[&(*II)] = Latency;
+
+    if (Latency > Max_latency)
+      Max_latency = Latency;
+  }
+
+  if (LongestChain)
+    return Max_latency;
+
+  // The length of the dependence chain containing the compare instruction is
+  // wanted, so the terminator must be a BranchInst.
+  assert(isa<BranchInst>(II) && "Expected the block to end in a BranchInst");
+  BranchInst *Br = cast<BranchInst>(II);
+  Instruction *Cmp = dyn_cast<Instruction>(Br->getCondition());
+  if (Cmp && Instructions.count(Cmp))
+    return Instructions[Cmp];
+  return 0;
+}
+
+/// Instructions in BB2 may depend on instructions in BB1, and instructions
+/// in BB1 may have users in BB2. If the last (in terms of latency) such kind
+/// of instruction in BB1 is I, then the instructions after I can be executed
+/// in parallel with instructions in BB2.
+/// This function returns the latency of I.
+///
+/// Operands of the instructions in \p BB2 are followed through \p IfBlock1 and
+/// \p IfBlock2 (which may be null) until they reach \p BB1. PHI nodes of
+/// \p BB2 without a user inside \p BB2 are ignored. \p BB1_instructions holds
+/// the accumulated latencies computed for \p BB1; 0 is returned when no
+/// instruction recorded there is reached.
+static unsigned
+LatencyAdjustment(BasicBlock *BB1, BasicBlock *BB2, BasicBlock *IfBlock1,
+                  BasicBlock *IfBlock2,
+                  DenseMap<Instruction *, unsigned> &BB1_instructions) {
+  unsigned LastLatency = 0;
+  SmallVector<Instruction *, 16> Worklist;
+  BasicBlock::iterator II;
+  for (II = BB2->begin(); !isa<TerminatorInst>(II); ++II) {
+    if (PHINode *PN = dyn_cast<PHINode>(II)) {
+      // Look for users in BB2.
+      bool InBBUser = false;
+      for (User *U : PN->users()) {
+        if (cast<Instruction>(U)->getParent() == BB2) {
+          InBBUser = true;
+          break;
+        }
+      }
+      // No such user, we don't care about this instruction and its operands.
+      // Skip only this PHI: the rest of BB2 still has to be examined.
+      if (!InBBUser)
+        continue;
+    }
+    Worklist.push_back(&(*II));
+  }
+
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) {
+      if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(O))) {
+        if (Op->getParent() == IfBlock1 || Op->getParent() == IfBlock2)
+          Worklist.push_back(Op);
+        else if (Op->getParent() == BB1 && BB1_instructions.count(Op)) {
+          if (BB1_instructions[Op] > LastLatency)
+            LastLatency = BB1_instructions[Op];
+        }
+      }
+    }
+  }
+
+  return LastLatency;
+}
+
+/// If after if conversion, most of the instructions in new BB construct a
+/// long and slow dependence chain, it may be slower than cmp/branch, even
+/// if the branch has a high miss rate, because the data dependence is changed
+/// into control dependence, and the long dependence chain is split into two,
+/// the two parts can be executed in parallel on modern OOO processor.
+///
+/// \p BB1 must end in the branch being considered; the dependence chain of its
+/// condition is the one measured. \p IfBlock1 and \p IfBlock2 are the
+/// conditionally executed blocks (\p IfBlock2 may be null) and \p BB2 is the
+/// block that consumes their results. \p Speculation_size is added to the code
+/// sizes of \p BB1 and \p BB2 to estimate the size of the new BB.
+///
+/// \returns true if the dependence chain is long enough that if conversion
+/// should be avoided.
+static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2,
+                                    BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+                                    unsigned Speculation_size,
+                                    const TargetTransformInfo &TTI) {
+  if (!TTI.isOutOfOrder())
+    return false;
+
+  unsigned New_BB_size = CountBBCodeSize(BB1, TTI) +
+                         CountBBCodeSize(BB2, TTI) + Speculation_size;
+
+  // We check small BB only since it is more difficult to find unrelated
+  // instructions to fill functional units in small BB.
+  const unsigned MaxNewBBSize = 40;
+  if (New_BB_size > MaxNewBBSize)
+    return false;
+
+  // Accumulated latency of each instruction in their BBs.
+  DenseMap<Instruction *, unsigned> BB1_instructions;
+  DenseMap<Instruction *, unsigned> BB2_instructions;
+  unsigned BB1_chain =
+      FindDependenceChainLatency(BB1, BB1_instructions, TTI, false);
+  unsigned BB2_chain =
+      FindDependenceChainLatency(BB2, BB2_instructions, TTI, true);
+
+  // If we have a good ILP (IPC>=2) in new BB, then we don't care about the
+  // latency of the dependence chain.
+  if ((BB1_chain + BB2_chain) * 2 <= New_BB_size)
+    return false;
+
+  // We only care about part of the dependence chain in BB1 that can be
+  // executed in parallel with BB2, so adjust the latency. The adjustment is the
+  // largest latency of any BB1 instruction feeding BB2 and may exceed the
+  // latency of the compare's chain, so clamp at zero rather than letting the
+  // unsigned subtraction wrap around.
+  unsigned Adjustment =
+      LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1_instructions);
+  BB1_chain = BB1_chain > Adjustment ? BB1_chain - Adjustment : 0;
+
+  // Correctly predicted branch instruction can skip the dependence chain in
+  // BB1, but misprediction has a penalty, so only when the dependence chain is
+  // longer than DependenceChainLatency, then branch is better than select.
+  return BB1_chain >= DependenceChainLatency;
+}
+
 /// Extract ConstantInt from value, looking through IntToPtr
 /// and PointerNullValue. Return NULL if value is not a constant int.
 static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
@@ -2048,6 +2225,11 @@
   if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))
     return false;
 
+  // Don't do if conversion for long dependence chain.
+  if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr,
+                              CountBBCodeSize(ThenBB, TTI), TTI))
+    return false;
+
   // If we get here, we can hoist the instruction and if-convert.
   DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
 
@@ -2355,6 +2537,10 @@
     }
   }
 
+  if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2,
+                              AggressiveInsts.size(), TTI))
+    return false;
+
   DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond
                << " T: " << IfTrue->getName()
                << " F: " << IfFalse->getName() << "\n");
Index: test/Transforms/SimplifyCFG/X86/if-conversion.ll
===================================================================
--- test/Transforms/SimplifyCFG/X86/if-conversion.ll
+++ test/Transforms/SimplifyCFG/X86/if-conversion.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -simplifycfg -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -S | FileCheck %s
+; Avoid if conversion if there is a long dependence chain.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define i64 @foo(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @foo
+; CHECK-NOT: select
+}
+
+define i64 @bar(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = add i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @bar
+; CHECK-NOT: select
+}
+