Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -635,6 +635,9 @@
   /// \brief Additional properties of an operand's values.
   enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
 
+  /// \return True if target can execute instructions out of order.
+  bool isOutOfOrder() const;
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
@@ -1011,6 +1014,7 @@
                             Type *Ty) = 0;
   virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                             Type *Ty) = 0;
+  virtual bool isOutOfOrder() const = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
@@ -1283,6 +1287,9 @@
                     Type *Ty) override {
     return Impl.getIntImmCost(IID, Idx, Imm, Ty);
   }
+  bool isOutOfOrder() const override {
+    return Impl.isOutOfOrder();
+  }
   unsigned getNumberOfRegisters(bool Vector) override {
     return Impl.getNumberOfRegisters(Vector);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -331,6 +331,8 @@
     return TTI::TCC_Free;
   }
 
+  bool isOutOfOrder() const { return false; }
+
   unsigned getNumberOfRegisters(bool Vector) { return 8; }
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -394,6 +394,10 @@
     return BaseT::getInstructionLatency(I);
   }
 
+  bool isOutOfOrder() const {
+    return getST()->getSchedModel().isOutOfOrder();
+  }
+
   /// @}
 
   /// \name Vector TTI Implementations
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -314,6 +314,10 @@
   return Cost;
 }
 
+bool TargetTransformInfo::isOutOfOrder() const {
+  return TTIImpl->isOutOfOrder();
+}
+
 unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
   return TTIImpl->getNumberOfRegisters(Vector);
 }
Index: lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyCFG.cpp
+++ lib/Transforms/Utils/SimplifyCFG.cpp
@@ -127,6 +127,11 @@
     cl::desc("Limit maximum recursion depth when calculating costs of "
              "speculatively executed instructions"));
 
+static cl::opt<unsigned> DependenceChainLatency(
+    "dependence-chain-latency", cl::Hidden, cl::init(8),
+    cl::desc("Limit the maximum latency of dependence chain containing cmp "
+             "for if conversion"));
+
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLinearMaps,
           "Number of switch instructions turned into linear mapping");
@@ -395,6 +400,156 @@
   return true;
 }
 
+/// Estimate the code size of the specified BB.
+static int CountBBCodeSize(BasicBlock *BB, const TargetTransformInfo &TTI) {
+  int size = 0;
+  for (auto II = BB->begin(); !isa<TerminatorInst>(II); ++II)
+    size += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_CodeSize);
+  return size;
+}
+
+/// Find out the latency of the longest dependence chain in the BB or the
+/// dependence chain containing the compare instruction.
+///
+/// \p instructions receives the accumulated latency of every non-terminator
+/// instruction of \p BB. If \p LongestChain is true, the largest accumulated
+/// latency is returned; otherwise the accumulated latency of the branch
+/// condition of \p BB is returned, or 0 if it is not computed in \p BB.
+static int
+FindDependenceChainLatency(BasicBlock *BB,
+                           std::map<Instruction *, int> &instructions,
+                           const TargetTransformInfo &TTI, bool LongestChain) {
+  int max_latency = 0;
+
+  BasicBlock::iterator II;
+  for (II = BB->begin(); !isa<TerminatorInst>(II); ++II) {
+    int latency = 0;
+    for (unsigned O = 0, E = II->getNumOperands(); O != E; ++O) {
+      Instruction *op = dyn_cast<Instruction>(II->getOperand(O));
+      if (!op)
+        continue;
+      // Only operands already visited in this BB extend the chain.
+      auto It = instructions.find(op);
+      if (It != instructions.end() && It->second > latency)
+        latency = It->second;
+    }
+    latency += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_Latency);
+    instructions[&(*II)] = latency;
+
+    if (latency > max_latency)
+      max_latency = latency;
+  }
+
+  if (LongestChain)
+    return max_latency;
+
+  // II now points at the terminator. Look the condition up without inserting
+  // into the map: the terminator may not be a conditional branch, and the
+  // condition may be an argument, a constant or defined in another block.
+  BranchInst *br = dyn_cast<BranchInst>(II);
+  if (!br || !br->isConditional())
+    return 0;
+  Instruction *cond = dyn_cast<Instruction>(br->getCondition());
+  if (!cond)
+    return 0;
+  auto It = instructions.find(cond);
+  return It == instructions.end() ? 0 : It->second;
+}
+
+/// Instructions in BB2 may depend on instructions in BB1, and instructions
+/// in BB1 may have users in BB2. If the last (in terms of latency) such kind
+/// of instruction in BB1 is I, then the instructions after I can be executed
+/// in parallel with instructions in BB2.
+/// This function returns the latency of I.
+static int LatencyAdjustment(BasicBlock *BB1, BasicBlock *BB2,
+                             BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+                             std::map<Instruction *, int> &BB1_instructions) {
+  int LastLatency = 0;
+  SmallVector<Instruction *, 16> Worklist;
+  BasicBlock::iterator II;
+  for (II = BB2->begin(); !isa<TerminatorInst>(II); ++II) {
+    PHINode *PN = dyn_cast<PHINode>(II);
+    if (PN) {
+      // Look for users in BB2.
+      bool InBBUser = false;
+      for (User *U : PN->users()) {
+        if (cast<Instruction>(U)->getParent() == BB2) {
+          InBBUser = true;
+          break;
+        }
+      }
+      // No such user, we don't care about this instruction and its operands.
+      // Skip only this PHI; the remaining instructions must still be scanned.
+      if (!InBBUser)
+        continue;
+    }
+    Worklist.push_back(&(*II));
+  }
+
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) {
+      Instruction *op = dyn_cast<Instruction>(I->getOperand(O));
+      if (op) {
+        if (op->getParent() == IfBlock1 || op->getParent() == IfBlock2)
+          Worklist.push_back(op);
+        else if (op->getParent() == BB1 && BB1_instructions.count(op)) {
+          if (BB1_instructions[op] > LastLatency)
+            LastLatency = BB1_instructions[op];
+        }
+      }
+    }
+  }
+
+  return LastLatency;
+}
+
+/// If after if conversion, most of the instructions in new BB construct a
+/// long and slow dependence chain, it may be slower than cmp/branch, even
+/// if the branch has a high miss rate, because the data dependence is changed
+/// into control dependence, and the long dependence chain is split into two,
+/// the two parts can be executed in parallel on modern OOO processor.
+static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2,
+                                    BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+                                    int speculation_size,
+                                    const TargetTransformInfo &TTI) {
+  // Accumulated latency of each instruction in their BBs.
+  std::map<Instruction *, int> BB1_instructions;
+  std::map<Instruction *, int> BB2_instructions;
+
+  if (!TTI.isOutOfOrder())
+    return false;
+
+  int new_BB_size = CountBBCodeSize(BB1, TTI) + CountBBCodeSize(BB2, TTI) +
+                    speculation_size;
+
+  // We check small BB only since it is more difficult to find unrelated
+  // instructions to fill functional units in small BB.
+  if (new_BB_size > 40)
+    return false;
+
+  auto BB1_chain =
+      FindDependenceChainLatency(BB1, BB1_instructions, TTI, false);
+  auto BB2_chain =
+      FindDependenceChainLatency(BB2, BB2_instructions, TTI, true);
+
+  // If we have a good ILP (IPC>=2) in new BB, then we don't care about the
+  // latency of the dependence chain.
+  if ((BB1_chain + BB2_chain) * 2 <= new_BB_size)
+    return false;
+
+  // We only care about part of the dependence chain in BB1 that can be
+  // executed in parallel with BB2, so adjust the latency.
+  BB1_chain -=
+      LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1_instructions);
+
+  // Correctly predicted branch instruction can skip the dependence chain in
+  // BB1, but misprediction has a penalty, so only when the dependence chain is
+  // longer than DependenceChainLatency, then branch is better than select.
+  // The adjusted latency may be negative, so compare as signed values.
+  return BB1_chain >= static_cast<int>(DependenceChainLatency);
+}
+
 /// Extract ConstantInt from value, looking through IntToPtr
 /// and PointerNullValue. Return NULL if value is not a constant int.
 static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
@@ -2023,6 +2178,11 @@
   if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))
     return false;
 
+  // Don't do if conversion for long dependence chain.
+  if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr,
+                              CountBBCodeSize(ThenBB, TTI), TTI))
+    return false;
+
   // If we get here, we can hoist the instruction and if-convert.
   DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
 
@@ -2324,6 +2484,10 @@
     }
   }
 
+  if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2,
+                              AggressiveInsts.size(), TTI))
+    return false;
+
   DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: "
                << IfTrue->getName() << " F: " << IfFalse->getName() << "\n");
 
Index: test/Transforms/SimplifyCFG/X86/if-conversion.ll
===================================================================
--- test/Transforms/SimplifyCFG/X86/if-conversion.ll
+++ test/Transforms/SimplifyCFG/X86/if-conversion.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -simplifycfg -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -S | FileCheck %s
+; Avoid if conversion if there is a long dependence chain.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define i64 @foo(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @foo
+; CHECK-NOT: select
+}
+
+define i64 @bar(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = add i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @bar
+; CHECK-NOT: select
+}
+