Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -114,6 +114,7 @@ void initializeConstantHoistingLegacyPassPass(PassRegistry&); void initializeConstantMergeLegacyPassPass(PassRegistry&); void initializeConstraintEliminationPass(PassRegistry &); +void initializeControlDependentDCEPass(PassRegistry&); void initializeControlHeightReductionLegacyPassPass(PassRegistry&); void initializeCorrelatedValuePropagationPass(PassRegistry&); void initializeCostModelAnalysisPass(PassRegistry&); Index: llvm/include/llvm/LinkAllPasses.h =================================================================== --- llvm/include/llvm/LinkAllPasses.h +++ llvm/include/llvm/LinkAllPasses.h @@ -173,6 +173,7 @@ (void) llvm::createStripDeadPrototypesPass(); (void) llvm::createTailCallEliminationPass(); (void) llvm::createJumpThreadingPass(); + (void) llvm::createControlDependentDCEPass(); (void) llvm::createUnifyFunctionExitNodesPass(); (void) llvm::createInstCountPass(); (void) llvm::createConstantHoistingPass(); Index: llvm/include/llvm/Transforms/Scalar.h =================================================================== --- llvm/include/llvm/Transforms/Scalar.h +++ llvm/include/llvm/Transforms/Scalar.h @@ -240,6 +240,8 @@ FunctionPass *createJumpThreadingPass(bool FreezeSelectCond = false, int Threshold = -1); +FunctionPass *createControlDependentDCEPass(int Threshold = -1); + //===----------------------------------------------------------------------===// // // CFGSimplification - Merge basic blocks, eliminate unreachable blocks, Index: llvm/include/llvm/Transforms/Scalar/ControlDependentDCE.h =================================================================== --- /dev/null +++ llvm/include/llvm/Transforms/Scalar/ControlDependentDCE.h @@ -0,0 +1,134 @@ +//===- ControlDependentDCE.h - Control Dependent DCE ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// See the comments on ControlDependentDCEPass. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_CONTROLDEPENDENTDCE_H +#define LLVM_TRANSFORMS_SCALAR_CONTROLDEPENDENTDCE_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/IR/ValueHandle.h" +#include +#include + +namespace llvm { + +class BasicBlock; +class BinaryOperator; +class BranchInst; +class CmpInst; +class Constant; +class DomTreeUpdater; +class Function; +class Instruction; +class IntrinsicInst; +class LazyValueInfo; +class LoadInst; +class PHINode; +class TargetLibraryInfo; +class Value; + +/// A private "module" namespace for types and utilities used by +/// ControldependentdCE. +/// These are implementation details and should not be used by clients. +namespace controldependentdCE {} // end namespace controldependentdCE + +/// This pass performs 'control dependent DCE', which converts a conditional +/// branch of BB into an unconditional branch if possible, when the basic block +/// BBa dominates BB and both basic blocks have conditional branches under the +/// same condition. +/// +/// BBa: +/// %cmp = ... +/// br i1 %cmp, label %LA1, label %LA2 +/// +/// ... +/// +/// BB: +/// ... +/// br i1 %cmp, label %LD1, label %LD2 +/// +/// The condition to apply this optimization is that the predecessors of BB is +/// divided into the set S1 coming from LA1 and the set S2 coming from LA2. +/// If S1(or S2) is empty, it rewrites the conditional branch to an +/// unconditional branch to LD2(or LD1). +/// If both S1 and S2 are not empty and BB can be duplicated, BB is duplicated +/// for S2, and each conditional branches in BB and the duplicated BB are +/// converted into unconditional branches for LD1 and LD2, respectively. +class ControlDependentDCEPass : public PassInfoMixin { + TargetLibraryInfo *TLI; + LazyValueInfo *LVI; + AliasAnalysis *AA; + DomTreeUpdater *DTU; + std::unique_ptr BFI; + std::unique_ptr BPI; + bool HasProfileData = false; +#ifdef NDEBUG + SmallPtrSet LoopHeaders; +#else + SmallSet, 16> LoopHeaders; +#endif + + unsigned BBDupThreshold; + unsigned DefaultBBDupThreshold; + +public: + ControlDependentDCEPass(int T = -1); + + // Glue for old PM. + bool runImpl(Function &F, TargetLibraryInfo *TLI_, LazyValueInfo *LVI_, + AliasAnalysis *AA_, DomTreeUpdater *DTU_, bool HasProfileData_, + std::unique_ptr BFI_, + std::unique_ptr BPI_); + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + + void releaseMemory() { + BFI.reset(); + BPI.reset(); + } + + void FindLoopHeaders(Function &F); + bool + processBBWithCommonDomCondition(Function &F, + SmallPtrSetImpl &Unreachable); + void UpdateSSA(BasicBlock *BB, BasicBlock *NewBB, + DenseMap &ValueMapping); + DenseMap CloneInstructions(BasicBlock::iterator BI, + BasicBlock::iterator BE, + BasicBlock *NewBB, + BasicBlock *PredBB); + bool TryThreadEdge(BasicBlock *BB, + const SmallVectorImpl &PredBBs, + BasicBlock *SuccBB); + void ThreadEdge(BasicBlock *BB, const SmallVectorImpl &PredBBs, + BasicBlock *SuccBB); + +private: + BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef Preds, + const char *Suffix); + void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB, + BasicBlock *NewBB, BasicBlock *SuccBB); + /// Check if the block has profile metadata for its outgoing edges. + bool doesBlockHaveProfileData(BasicBlock *BB); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_CONTROLDEPENDENTDCE_H Index: llvm/lib/LTO/LTOCodeGenerator.cpp =================================================================== --- llvm/lib/LTO/LTOCodeGenerator.cpp +++ llvm/lib/LTO/LTOCodeGenerator.cpp @@ -137,6 +137,7 @@ initializeOpenMPOptLegacyPassPass(R); initializeArgPromotionPass(R); initializeJumpThreadingPass(R); + initializeControlDependentDCEPass(R); initializeSROALegacyPassPass(R); initializeAttributorLegacyPassPass(R); initializeAttributorCGSCCLegacyPassPass(R); Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -131,6 +131,7 @@ #include "llvm/Transforms/Scalar/CallSiteSplitting.h" #include "llvm/Transforms/Scalar/ConstantHoisting.h" #include "llvm/Transforms/Scalar/ConstraintElimination.h" +#include "llvm/Transforms/Scalar/ControlDependentDCE.h" #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" #include "llvm/Transforms/Scalar/DCE.h" #include "llvm/Transforms/Scalar/DeadStoreElimination.h" @@ -243,7 +244,11 @@ static cl::opt EnableGVNSink( "enable-npm-gvn-sink", cl::init(false), cl::Hidden, - cl::desc("Enable the GVN hoisting pass for the new PM (default = off)")); + cl::desc("Enable the GVN hoisting pass for the new PM (default = off)a")); + +static cl::opt EnableControlDependentDCE( + "enable-npm-control-dependent-dce", cl::init(false), cl::Hidden, + cl::desc("Enable Optimization using GVN result (default = off)")); static cl::opt EnableUnrollAndJam( "enable-npm-unroll-and-jam", cl::init(false), cl::Hidden, @@ -616,8 +621,10 @@ // Optimize based on known information about branches, and cleanup afterward. FPM.addPass(JumpThreadingPass()); + if (EnableGVNHoist || EnableGVNSink) + if (EnableControlDependentDCE) + FPM.addPass(ControlDependentDCEPass()); FPM.addPass(CorrelatedValuePropagationPass()); - FPM.addPass(SimplifyCFGPass()); if (Level == OptimizationLevel::O3) FPM.addPass(AggressiveInstCombinePass()); @@ -732,6 +739,8 @@ // Re-consider control flow based optimizations after redundancy elimination, // redo DCE, etc. FPM.addPass(JumpThreadingPass()); + if (EnableControlDependentDCE) + FPM.addPass(ControlDependentDCEPass()); FPM.addPass(CorrelatedValuePropagationPass()); FPM.addPass(DSEPass()); FPM.addPass(createFunctionToLoopPassAdaptor( @@ -1594,6 +1603,8 @@ MainFPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(MainFPM, Level); MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); + if (EnableControlDependentDCE) + MainFPM.addPass(ControlDependentDCEPass()); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM))); // Create a function that performs CFI checks for cross-DSO calls with Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -236,6 +236,7 @@ FUNCTION_PASS("nary-reassociate", NaryReassociatePass()) FUNCTION_PASS("newgvn", NewGVNPass()) FUNCTION_PASS("jump-threading", JumpThreadingPass()) +FUNCTION_PASS("control-dependent-dce", ControlDependentDCEPass()) FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass()) FUNCTION_PASS("lcssa", LCSSAPass()) FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass()) Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -134,6 +134,10 @@ "enable-gvn-sink", cl::init(false), cl::ZeroOrMore, cl::desc("Enable the GVN sinking pass (default = off)")); +static cl::opt EnableControlDependentDCE( + "enable-control-dependent-dce", cl::init(false), cl::Hidden, + cl::desc("Enable Optimization using GVN result (default = off)")); + // This option is used in simplifying testing SampleFDO optimizations for // profile loading. static cl::opt @@ -394,6 +398,9 @@ MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); MPM.add(createJumpThreadingPass()); // Thread jumps. + if (EnableGVNHoist || EnableGVNSink) + if (EnableControlDependentDCE) + MPM.add(createControlDependentDCEPass()); MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals } MPM.add(createCFGSimplificationPass()); // Merge & remove BBs @@ -470,6 +477,8 @@ addExtensionsToPM(EP_Peephole, MPM); if (OptLevel > 1) { MPM.add(createJumpThreadingPass()); // Thread jumps + if (EnableControlDependentDCE) + MPM.add(createControlDependentDCEPass()); MPM.add(createCorrelatedValuePropagationPass()); MPM.add(createDeadStoreEliminationPass()); // Delete dead stores MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); @@ -1070,6 +1079,8 @@ addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true)); + if (EnableControlDependentDCE) + PM.add(createControlDependentDCEPass()); } void PassManagerBuilder::addLateLTOOptimizationPasses( Index: llvm/lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/Scalar/CMakeLists.txt +++ llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -22,6 +22,7 @@ InferAddressSpaces.cpp InstSimplifyPass.cpp JumpThreading.cpp + ControlDependentDCE.cpp LICM.cpp LoopAccessAnalysisPrinter.cpp LoopSink.cpp Index: llvm/lib/Transforms/Scalar/ControlDependentDCE.cpp =================================================================== --- /dev/null +++ llvm/lib/Transforms/Scalar/ControlDependentDCE.cpp @@ -0,0 +1,983 @@ +//===- ControlDependentDCE.cpp - Control Dependent DCE -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the Control Dependent DCE pass. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/ControlDependentDCE.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/GuardUtils.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; +using namespace controldependentdCE; + +#define DEBUG_TYPE "control-dependent-dce" + +STATISTIC(NumThreads, "Number of jumps threaded"); + +static cl::opt BBDuplicateThreshold( + "control-dependent-dce-threshold", + cl::desc("Max block size to duplicate for control dependent DCE"), + cl::init(6), cl::Hidden); + +static cl::opt PrintLVIAfterControlDependentDCE( + "print-lvi-after-control-dependent-dce", + cl::desc("Print the LazyValueInfo cache after ControlDependentDCE"), + cl::init(false), cl::Hidden); + +static cl::opt ThreadAcrossLoopHeaders( + "control-dependent-dce-across-loop-headers", + cl::desc( + "Allow ControlDependentDCE to thread across loop headers, for testing"), + cl::init(false), cl::Hidden); + +namespace { + +/// This pass performs 'control dependent DCE'. +class ControlDependentDCE : public FunctionPass { + ControlDependentDCEPass Impl; + +public: + static char ID; // Pass identification + + ControlDependentDCE(int T = -1) : FunctionPass(ID), Impl(T) { + initializeControlDependentDCEPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addRequired(); + } + + void releaseMemory() override { Impl.releaseMemory(); } +}; + +} // end anonymous namespace + +char ControlDependentDCE::ID = 0; + +INITIALIZE_PASS_BEGIN(ControlDependentDCE, "control-dependent-dce", + "Control Dependent DCE", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(ControlDependentDCE, "control-dependent-dce", + "Control Dependent DCE", false, false) + +// Public interface to the Control Dependent DCE pass +FunctionPass *llvm::createControlDependentDCEPass(int Threshold) { + return new ControlDependentDCE(Threshold); +} + +ControlDependentDCEPass::ControlDependentDCEPass(int T) { + DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T); +} + +/// runOnFunction - Toplevel algorithm. +bool ControlDependentDCE::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + auto TLI = &getAnalysis().getTLI(F); + auto DT = &getAnalysis().getDomTree(); + auto LVI = &getAnalysis().getLVI(); + auto AA = &getAnalysis().getAAResults(); + DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy); + std::unique_ptr BFI; + std::unique_ptr BPI; + if (F.hasProfileData()) { + LoopInfo LI{DominatorTree(F)}; + BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); + } + + bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(), + std::move(BFI), std::move(BPI)); + if (PrintLVIAfterControlDependentDCE) { + dbgs() << "LVI for function '" << F.getName() << "':\n"; + LVI->printLVI(F, DTU.getDomTree(), dbgs()); + } + return Changed; +} + +PreservedAnalyses ControlDependentDCEPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &TLI = AM.getResult(F); + auto &DT = AM.getResult(F); + auto &LVI = AM.getResult(F); + auto &AA = AM.getResult(F); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + + std::unique_ptr BFI; + std::unique_ptr BPI; + if (F.hasProfileData()) { + LoopInfo LI{DominatorTree(F)}; + BPI.reset(new BranchProbabilityInfo(F, LI, &TLI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); + } + + bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(), + std::move(BFI), std::move(BPI)); + + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve(); + PA.preserve(); + PA.preserve(); + return PA; +} + +bool ControlDependentDCEPass::runImpl( + Function &F, TargetLibraryInfo *TLI_, LazyValueInfo *LVI_, + AliasAnalysis *AA_, DomTreeUpdater *DTU_, bool HasProfileData_, + std::unique_ptr BFI_, + std::unique_ptr BPI_) { + LLVM_DEBUG(dbgs() << "Control Dependent DCE on function '" << F.getName() + << "'\n"); + TLI = TLI_; + LVI = LVI_; + AA = AA_; + DTU = DTU_; + BFI.reset(); + BPI.reset(); + // When profile data is available, we need to update edge weights after + // successful control dependent DCE, which requires both BPI and BFI being + // available. + HasProfileData = HasProfileData_; + if (HasProfileData) { + BPI = std::move(BPI_); + BFI = std::move(BFI_); + } + + // Reduce the number of instructions duplicated when optimizing strictly for + // size. + if (BBDuplicateThreshold.getNumOccurrences()) + BBDupThreshold = BBDuplicateThreshold; + else if (F.hasFnAttribute(Attribute::MinSize)) + BBDupThreshold = 3; + else + BBDupThreshold = DefaultBBDupThreshold; + + // ControlDependentDCE must not processes blocks unreachable from entry. It's + // a waste of compute time and can potentially lead to hangs. + SmallPtrSet Unreachable; + assert(DTU && "DTU isn't passed into ControlDependentDCE before using it."); + assert(DTU->hasDomTree() && + "ControlDependentDCE relies on DomTree to proceed."); + DominatorTree &DT = DTU->getDomTree(); + for (auto &BB : F) + if (!DT.isReachableFromEntry(&BB)) + Unreachable.insert(&BB); + + if (!ThreadAcrossLoopHeaders) + FindLoopHeaders(F); + + bool EverChanged = false; + bool Changed; + do { + Changed = false; + if (processBBWithCommonDomCondition(F, Unreachable)) { + Changed = true; + DominatorTree &DT = DTU->getDomTree(); + for (auto &BB : F) + if (!Unreachable.count(&BB) && !DT.isReachableFromEntry(&BB)) { + LLVM_DEBUG(dbgs() + << " CDDCE: Deleting dead block '" << BB.getName() + << "' with terminator: " << *BB.getTerminator() << '\n'); + LoopHeaders.erase(&BB); + LVI->eraseBlock(&BB); + DeleteDeadBlock(&BB, DTU); + } + } + EverChanged |= Changed; + } while (Changed); + + LoopHeaders.clear(); + return EverChanged; +} + +/// Return the cost of duplicating a piece of this block from first non-phi +/// and before StopAt instruction to thread across it. Stop scanning the block +/// when exceeding the threshold. If duplication is impossible, returns ~0U. +static unsigned getJumpThreadDuplicationCost(BasicBlock *BB, + Instruction *StopAt, + unsigned Threshold) { + assert(StopAt->getParent() == BB && "Not an instruction from proper BB?"); + /// Ignore PHI nodes, these will be flattened when duplication happens. + BasicBlock::const_iterator I(BB->getFirstNonPHI()); + + // FIXME: THREADING will delete values that are just used to compute the + // branch, so they shouldn't count against the duplication cost. + + unsigned Bonus = 0; + if (BB->getTerminator() == StopAt) { + // Threading through a switch statement is particularly profitable. If this + // block ends in a switch, decrease its cost to make it more likely to + // happen. + if (isa(StopAt)) + Bonus = 6; + + // The same holds for indirect branches, but slightly more so. + if (isa(StopAt)) + Bonus = 8; + } + + // Bump the threshold up so the early exit from the loop doesn't skip the + // terminator-based Size adjustment at the end. + Threshold += Bonus; + + // Sum up the cost of each instruction until we get to the terminator. Don't + // include the terminator because the copy won't include it. + unsigned Size = 0; + for (; &*I != StopAt; ++I) { + + // Stop scanning the block if we've reached the threshold. + if (Size > Threshold) + return Size; + + // Debugger intrinsics don't incur code size. + if (isa(I)) + continue; + + // If this is a pointer->pointer bitcast, it is free. + if (isa(I) && I->getType()->isPointerTy()) + continue; + + // Freeze instruction is free, too. + if (isa(I)) + continue; + + // Bail out if this instruction gives back a token type, it is not possible + // to duplicate it if it is used outside this BB. + if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB)) + return ~0U; + + // All other instructions count for at least one unit. + ++Size; + + // Calls are more expensive. If they are non-intrinsic calls, we model them + // as having cost of 4. If they are a non-vector intrinsic, we model them + // as having cost of 2 total, and if they are a vector intrinsic, we model + // them as having cost 1. + if (const CallInst *CI = dyn_cast(I)) { + if (CI->cannotDuplicate() || CI->isConvergent()) + // Blocks with NoDuplicate are modelled as having infinite cost, so they + // are never duplicated. + return ~0U; + else if (!isa(CI)) + Size += 3; + else if (!CI->getType()->isVectorTy()) + Size += 1; + } + } + + return Size > Bonus ? Size - Bonus : 0; +} + +/// FindLoopHeaders - We do not want jump threading to turn proper loop +/// structures into irreducible loops. Doing this breaks up the loop nesting +/// hierarchy and pessimizes later transformations. To prevent this from +/// happening, we first have to find the loop headers. Here we approximate this +/// by finding targets of backedges in the CFG. +/// +/// Note that there definitely are cases when we want to allow threading of +/// edges across a loop header. For example, threading a jump from outside the +/// loop (the preheader) to an exit block of the loop is definitely profitable. +/// It is also almost always profitable to thread backedges from within the loop +/// to exit blocks, and is often profitable to thread backedges to other blocks +/// within the loop (forming a nested loop). This simple analysis is not rich +/// enough to track all of these properties and keep it up-to-date as the CFG +/// mutates, so we don't allow any of these transformations. +void ControlDependentDCEPass::FindLoopHeaders(Function &F) { + SmallVector, 32> Edges; + FindFunctionBackedges(F, Edges); + + for (const auto &Edge : Edges) + LoopHeaders.insert(Edge.second); +} + +/// collectReachableBBSet - Create a set of reachable basic blocks from BB in +/// the reverse direction of the control flow as TouchBBSet. +/// When reaching BBa, search is stopped there. +static void collectReachableBBSet(BasicBlock *BB, BasicBlock *BBa, + DenseSet &TouchBBSet) { + if (TouchBBSet.count(BB)) + return; + TouchBBSet.insert(BB); + for (BasicBlock *Pred : predecessors(BB)) { + if (Pred != BBa) { + collectReachableBBSet(Pred, BBa, TouchBBSet); + } + } +} + +/// findPredSourceEdge - Classify the predecessor Pred of BB into the following +/// categories: +enum PredOrigin : unsigned { + PredFromSuccBB0, // from SuccBB0 + PredFromSuccBB1, // from SuccBB1 + PredFromBothSuccBB0AndSuccBB1, // from both SuccBB0 and SuccBB1 + PredUnreachable, // unreachable +}; +/// BBa is the basic block which dominates BB. +/// Both SuccBB0 and SuccBB1 are the successors of BBa. +static enum PredOrigin findPredSourceEdge(BasicBlock *BBa, BasicBlock *BB, + BasicBlock *Pred, BasicBlock *SuccBB0, + BasicBlock *SuccBB1) { + if (Pred == BBa) { + if (BB == SuccBB0) + return PredFromSuccBB0; + if (BB == SuccBB1) + return PredFromSuccBB1; + llvm_unreachable("This cannot happen!"); + } + DenseSet TouchBBSet; + collectReachableBBSet(Pred, BBa, TouchBBSet); + bool ReachSuccBB0 = TouchBBSet.count(SuccBB0); + bool ReachSuccBB1 = TouchBBSet.count(SuccBB1); + if (ReachSuccBB0 && ReachSuccBB1) + return PredFromBothSuccBB0AndSuccBB1; + if (ReachSuccBB0) + return PredFromSuccBB0; + if (ReachSuccBB1) + return PredFromSuccBB1; + return PredUnreachable; +} + +/// findBBWithCommonDomCondition - Detect BBa that dominates BB and has the same +/// condition code C as BB. BBList is sorted in decreasing order of levels on +/// the dominating tree with the condition code C. The position of BB in BBList +/// is StartIdx - 1. +/// If there was BBa, then return it and also the set of predecessor of BB is +/// divided into Preds0 and Preds1. +static BasicBlock *findBBWithCommonDomCondition( + DominatorTree &DT, SmallVectorImpl &BBList, BasicBlock *BB, + unsigned StartIdx, SmallVectorImpl &Preds0, + SmallVectorImpl &Preds1) { + size_t Size = BBList.size(); + for (unsigned Idx = StartIdx; Idx < Size; ++Idx) { + BasicBlock *BBa = BBList[Idx]; + if (!DT.dominates(BBa, BB)) + continue; + BranchInst *CondBr = dyn_cast(BBa->getTerminator()); + BasicBlock *SuccBB0 = CondBr->getSuccessor(0); + BasicBlock *SuccBB1 = CondBr->getSuccessor(1); + if (SuccBB0 == SuccBB1 || BBa == SuccBB0 || BBa == SuccBB1) + continue; + SmallVector Preds(predecessors(BB)); + SmallVector PredsBoth; + for (auto Pred : Preds) { + enum PredOrigin Src = PredFromBothSuccBB0AndSuccBB1; + if (Pred == BBa) { + if (BB == SuccBB0) + Src = PredFromSuccBB0; + else if (BB == SuccBB1) + Src = PredFromSuccBB1; + else + llvm_unreachable("This cannot happen!"); + } else if (Pred == BB) + Src = PredFromBothSuccBB0AndSuccBB1; + else + Src = findPredSourceEdge(BBa, BB, Pred, SuccBB0, SuccBB1); + switch (Src) { + case PredFromSuccBB0: + Preds0.push_back(Pred); + break; + case PredFromSuccBB1: + Preds1.push_back(Pred); + break; + case PredFromBothSuccBB0AndSuccBB1: + PredsBoth.push_back(Pred); + break; + case PredUnreachable: + break; + } + } + return PredsBoth.empty() ? BBa : nullptr; + } + return nullptr; +} + +/// conditionalToUnconditional - Change BBd to the basic block of unconditional +/// branching to Ks. +enum KeepSide : unsigned { KeepThen = 0, KeepElse = 1 }; +static BasicBlock *conditionalToUnconditional(DomTreeUpdater *DTU, + BasicBlock *BBd, + enum KeepSide Ks) { + BranchInst *CondBr = dyn_cast(BBd->getTerminator()); + BasicBlock *ToRemoveSucc = CondBr->getSuccessor(!Ks); + BasicBlock *ToKeepSucc = CondBr->getSuccessor(Ks); + ToRemoveSucc->removePredecessor(BBd); + BranchInst *NewBI = BranchInst::Create(ToKeepSucc, CondBr); + NewBI->setDebugLoc(ToKeepSucc->getTerminator()->getDebugLoc()); + CondBr->eraseFromParent(); + DTU->applyUpdatesPermissive({{DominatorTree::Delete, BBd, ToRemoveSucc}}); + return ToRemoveSucc; +} + +/// processBBWithCommonDomCondition - Convert conditional branches into +/// unconditional branches using common dominator conditions. +bool ControlDependentDCEPass::processBBWithCommonDomCondition( + Function &F, SmallPtrSetImpl &Unreachable) { + DominatorTree &DT = DTU->getDomTree(); + // Create a CtoBBList representing the correspondence between a condition code + // and the list of basic blocks that use the condition code. + MapVector> CtoBBList; + for (auto &BB : F) { + if (Unreachable.count(&BB)) + continue; + Instruction *Terminator = BB.getTerminator(); + if (BranchInst *BI = dyn_cast(Terminator)) { + if (BI->isConditional()) { + Value *Condition = BI->getCondition(); + CtoBBList[Condition].push_back(&BB); + } + } + } + + // NumUbs represents the number of BB which is converted conditional branch + // into unconditional branch. NumDups represents the number of BB which is + // converted conditional branch into unconditional branch by duplicating BB. + unsigned NumUbs = 0; + unsigned NumDups = 0; + bool Changed = false; + for (auto &Pair : CtoBBList) { + bool AdditionalUnreachables = false; + SmallVectorImpl &BBList = Pair.second; + // If the number of basic blocks using the condition code is less than 2, + // optimization is not performed. + if (BBList.size() < 2) + continue; + // When there are multiple basic blocks that use the condition code, + // optimization is performed in decreasing order of the level of the + // dominator tree. + std::stable_sort( + BBList.begin(), BBList.end(), [&DT](BasicBlock *X, BasicBlock *Y) { + return DT.getNode(X)->getLevel() > DT.getNode(Y)->getLevel(); + }); + unsigned Idx = 0; + for (auto *BB : BBList) { + SmallVector Preds0; + SmallVector Preds1; + ++Idx; + // If the predecessors of BB can not be completely divided into Preds0 and + // Preds1, ignore BB. Where only the true value of the condition code used + // by BB reaches Preds0, and only the false value reaches Preds1, + // respectively. + if (!findBBWithCommonDomCondition(DT, BBList, BB, Idx, Preds0, Preds1)) + continue; + bool Preds0Empty = Preds0.empty(); + bool Preds1Empty = Preds1.empty(); + // If BB is unreachable, ignore it. + if (Preds0Empty && Preds1Empty) + continue; + // If the value of the condition code reaching BB is only true or only + // false, convert the conditional branch of BB into an unconditional + // branch. + if (Preds0Empty || Preds1Empty) { + enum KeepSide Ks = Preds0Empty ? KeepElse : KeepThen; + BasicBlock *ToRemoveSucc = conditionalToUnconditional(DTU, BB, Ks); + ++NumUbs; + if (pred_empty(ToRemoveSucc)) + AdditionalUnreachables = true; + Changed = true; + continue; + } + // When BB is a loop header, ignore it in order not to make an irreducible + // loop. + if (LoopHeaders.count(BB)) + continue; + BranchInst *CondBr = dyn_cast(BB->getTerminator()); + BasicBlock *SuccBB0 = CondBr->getSuccessor(0); + BasicBlock *SuccBB1 = CondBr->getSuccessor(1); + // When BB is substantially an unconditional branch, ignore it. + if (SuccBB0 == SuccBB1) + continue; + // When BB is a loop having an edge to itself, ignore it in order not to + // make an irreducible loop. + if (SuccBB0 == BB || SuccBB1 == BB) + continue; + // If BB can be duplicated according to ThreadEdge, duplicate BB and + // convert conditional branches of both BB and duplicated BB to + // unconditional branches. + if (TryThreadEdge(BB, Preds1, SuccBB1)) { + SuccBB1->removePredecessor(BB); + BranchInst *NewBI = BranchInst::Create(SuccBB0, CondBr); + NewBI->setDebugLoc(SuccBB0->getTerminator()->getDebugLoc()); + CondBr->eraseFromParent(); + DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, SuccBB1}}); + ++NumDups; + Changed = true; + } + } + // If new unreachable basic blocks occur additionally, re-execute the whole + // thing before processing the next condition code. + if (AdditionalUnreachables) + break; + } + LLVM_DEBUG( + dbgs() << "CDDCE: Number of BBs processed: #NumUbs=" << NumUbs + << " #NumDups=" << NumDups << "\n"); + return Changed; +} + +/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new +/// predecessor to the PHIBB block. If it has PHI nodes, add entries for +/// NewPred using the entries from OldPred (suitably mapped). +static void +AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, BasicBlock *OldPred, + BasicBlock *NewPred, + DenseMap &ValueMap) { + for (PHINode &PN : PHIBB->phis()) { + // Ok, we have a PHI node. Figure out what the incoming value was for the + // DestBlock. + Value *IV = PN.getIncomingValueForBlock(OldPred); + + // Remap the value if necessary. + if (Instruction *Inst = dyn_cast(IV)) { + DenseMap::iterator I = ValueMap.find(Inst); + if (I != ValueMap.end()) + IV = I->second; + } + + PN.addIncoming(IV, NewPred); + } +} + +/// Update the SSA form. NewBB contains instructions that are copied from BB. +/// ValueMapping maps old values in BB to new ones in NewBB. +void ControlDependentDCEPass::UpdateSSA( + BasicBlock *BB, BasicBlock *NewBB, + DenseMap &ValueMapping) { + // If there were values defined in BB that are used outside the block, then we + // now have to update all uses of the value to use either the original value, + // the cloned value, or some PHI derived value. This can require arbitrary + // PHI insertion, of which we are prepared to do, clean these up now. + SSAUpdater SSAUpdate; + SmallVector UsesToRename; + + for (Instruction &I : *BB) { + // Scan all uses of this instruction to see if it is used outside of its + // block, and if so, record them in UsesToRename. + for (Use &U : I.uses()) { + Instruction *User = cast(U.getUser()); + if (PHINode *UserPN = dyn_cast(User)) { + if (UserPN->getIncomingBlock(U) == BB) + continue; + } else if (User->getParent() == BB) + continue; + + UsesToRename.push_back(&U); + } + + // If there are no uses outside the block, we're done with this instruction. + if (UsesToRename.empty()) + continue; + LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); + + // We found a use of I outside of BB. Rename all uses of I that are outside + // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks + // with the two values we know. + SSAUpdate.Initialize(I.getType(), I.getName()); + SSAUpdate.AddAvailableValue(BB, &I); + SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); + + while (!UsesToRename.empty()) + SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); + LLVM_DEBUG(dbgs() << "\n"); + } +} + +/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone +/// arguments that come from PredBB. Return the map from the variables in the +/// source basic block to the variables in the newly created basic block. +DenseMap ControlDependentDCEPass::CloneInstructions( + BasicBlock::iterator BI, BasicBlock::iterator BE, BasicBlock *NewBB, + BasicBlock *PredBB) { + // We are going to have to map operands from the source basic block to the new + // copy of the block 'NewBB'. If there are PHI nodes in the source basic + // block, evaluate them to account for entry from PredBB. + DenseMap ValueMapping; + + // Clone the phi nodes of the source basic block into NewBB. The resulting + // phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater + // might need to rewrite the operand of the cloned phi. + for (; PHINode *PN = dyn_cast(BI); ++BI) { + PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB); + NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB); + ValueMapping[PN] = NewPN; + } + + // Clone the non-phi instructions of the source basic block into NewBB, + // keeping track of the mapping and using it to remap operands in the cloned + // instructions. + for (; BI != BE; ++BI) { + Instruction *New = BI->clone(); + New->setName(BI->getName()); + NewBB->getInstList().push_back(New); + ValueMapping[&*BI] = New; + + // Remap operands to patch up intra-block references. + for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) + if (Instruction *Inst = dyn_cast(New->getOperand(i))) { + DenseMap::iterator I = ValueMapping.find(Inst); + if (I != ValueMapping.end()) + New->setOperand(i, I->second); + } + } + + return ValueMapping; +} + +/// TryThreadEdge - Thread an edge if it's safe and profitable to do so. +bool ControlDependentDCEPass::TryThreadEdge( + BasicBlock *BB, const SmallVectorImpl &PredBBs, + BasicBlock *SuccBB) { + // If threading to the same block as we come from, we would infinite loop. + if (SuccBB == BB) { + LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName() + << "' - would thread to self!\n"); + return false; + } + + // If threading this would thread across a loop header, don't thread the edge. + // See the comments above FindLoopHeaders for justifications and caveats. + if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) { + LLVM_DEBUG({ + bool BBIsHeader = LoopHeaders.count(BB); + bool SuccIsHeader = LoopHeaders.count(SuccBB); + dbgs() << " Not threading across " + << (BBIsHeader ? "loop header BB '" : "block BB '") + << BB->getName() << "' to dest " + << (SuccIsHeader ? "loop header BB '" : "block BB '") + << SuccBB->getName() + << "' - it might create an irreducible loop!\n"; + }); + return false; + } + + unsigned JumpThreadCost = + getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); + if (JumpThreadCost > BBDupThreshold) { + LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName() + << "' - Cost is too high: " << JumpThreadCost << "\n"); + return false; + } + + ThreadEdge(BB, PredBBs, SuccBB); + return true; +} + +/// ThreadEdge - We have decided that it is safe and profitable to factor the +/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB +/// across BB. Transform the IR to reflect this change. +void ControlDependentDCEPass::ThreadEdge( + BasicBlock *BB, const SmallVectorImpl &PredBBs, + BasicBlock *SuccBB) { + assert(SuccBB != BB && "Don't create an infinite loop"); + + assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) && + "Don't thread across loop headers"); + + // And finally, do it! Start by factoring the predecessors if needed. + BasicBlock *PredBB; + if (PredBBs.size() == 1) + PredBB = PredBBs[0]; + else { + LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size() + << " common predecessors.\n"); + PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); + } + + // And finally, do it! + LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() + << "' to '" << SuccBB->getName() << ", across block:\n " + << *BB << "\n"); + + LVI->threadEdge(PredBB, BB, SuccBB); + + BasicBlock *NewBB = BasicBlock::Create( + BB->getContext(), BB->getName() + ".thread", BB->getParent(), BB); + NewBB->moveAfter(PredBB); + + // Set the block frequency of NewBB. + if (HasProfileData) { + auto NewBBFreq = + BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB); + BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); + } + + // Copy all the instructions from BB to NewBB except the terminator. + DenseMap ValueMapping = + CloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB); + + // We didn't copy the terminator from BB over to NewBB, because there is now + // an unconditional jump to SuccBB. Insert the unconditional jump. + BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB); + NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); + + // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the + // PHI nodes for NewBB now. + AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping); + + // Update the terminator of PredBB to jump to NewBB instead of BB. This + // eliminates predecessors from BB, which requires us to simplify any PHI + // nodes in BB. + Instruction *PredTerm = PredBB->getTerminator(); + for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i) + if (PredTerm->getSuccessor(i) == BB) { + BB->removePredecessor(PredBB, true); + PredTerm->setSuccessor(i, NewBB); + } + + // Enqueue required DT updates. + DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, SuccBB}, + {DominatorTree::Insert, PredBB, NewBB}, + {DominatorTree::Delete, PredBB, BB}}); + + UpdateSSA(BB, NewBB, ValueMapping); + + // At this point, the IR is fully up to date and consistent. Do a quick scan + // over the new instructions and zap any that are constants or dead. This + // frequently happens because of phi translation. + SimplifyInstructionsInBlock(NewBB, TLI); + + // Update the edge weight from BB to SuccBB, which should be less than before. + UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB); + + // Threaded an edge! + ++NumThreads; +} + +/// Create a new basic block that will be the predecessor of BB and successor of +/// all blocks in Preds. When profile data is available, update the frequency of +/// this new block. +BasicBlock *ControlDependentDCEPass::SplitBlockPreds( + BasicBlock *BB, ArrayRef Preds, const char *Suffix) { + SmallVector NewBBs; + + // Collect the frequencies of all predecessors of BB, which will be used to + // update the edge weight of the result of splitting predecessors. + DenseMap FreqMap; + if (HasProfileData) + for (auto Pred : Preds) + FreqMap.insert(std::make_pair( + Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB))); + + // In the case when BB is a LandingPad block we create 2 new predecessors + // instead of just one. + if (BB->isLandingPad()) { + std::string NewName = std::string(Suffix) + ".split-lp"; + SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs); + } else { + NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix)); + } + + std::vector Updates; + Updates.reserve((2 * Preds.size()) + NewBBs.size()); + for (auto NewBB : NewBBs) { + BlockFrequency NewBBFreq(0); + Updates.push_back({DominatorTree::Insert, NewBB, BB}); + for (auto Pred : predecessors(NewBB)) { + Updates.push_back({DominatorTree::Delete, Pred, BB}); + Updates.push_back({DominatorTree::Insert, Pred, NewBB}); + if (HasProfileData) // Update frequencies between Pred -> NewBB. + NewBBFreq += FreqMap.lookup(Pred); + } + if (HasProfileData) // Apply the summed frequency to NewBB. + BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); + } + + DTU->applyUpdatesPermissive(Updates); + return NewBBs[0]; +} + +bool ControlDependentDCEPass::doesBlockHaveProfileData(BasicBlock *BB) { + const Instruction *TI = BB->getTerminator(); + assert(TI->getNumSuccessors() > 1 && "not a split"); + + MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof); + if (!WeightsNode) + return false; + + MDString *MDName = cast(WeightsNode->getOperand(0)); + if (MDName->getString() != "branch_weights") + return false; + + // Ensure there are weights for all of the successors. Note that the first + // operand to the metadata node is a name, not a weight. + return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1; +} + +/// Update the block frequency of BB and branch weight and the metadata on the +/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 - +/// Freq(PredBB->BB) / Freq(BB->SuccBB). +void ControlDependentDCEPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, + BasicBlock *BB, + BasicBlock *NewBB, + BasicBlock *SuccBB) { + if (!HasProfileData) + return; + + assert(BFI && BPI && "BFI & BPI should have been created here"); + + // As the edge from PredBB to BB is deleted, we have to update the block + // frequency of BB. + auto BBOrigFreq = BFI->getBlockFreq(BB); + auto NewBBFreq = BFI->getBlockFreq(NewBB); + auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB); + auto BBNewFreq = BBOrigFreq - NewBBFreq; + BFI->setBlockFreq(BB, BBNewFreq.getFrequency()); + + // Collect updated outgoing edges' frequencies from BB and use them to update + // edge probabilities. + SmallVector BBSuccFreq; + for (BasicBlock *Succ : successors(BB)) { + auto SuccFreq = (Succ == SuccBB) + ? BB2SuccBBFreq - NewBBFreq + : BBOrigFreq * BPI->getEdgeProbability(BB, Succ); + BBSuccFreq.push_back(SuccFreq.getFrequency()); + } + + uint64_t MaxBBSuccFreq = + *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end()); + + SmallVector BBSuccProbs; + if (MaxBBSuccFreq == 0) + BBSuccProbs.assign(BBSuccFreq.size(), + {1, static_cast(BBSuccFreq.size())}); + else { + for (uint64_t Freq : BBSuccFreq) + BBSuccProbs.push_back( + BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq)); + // Normalize edge probabilities so that they sum up to one. + BranchProbability::normalizeProbabilities(BBSuccProbs.begin(), + BBSuccProbs.end()); + } + + // Update edge probabilities in BPI. + BPI->setEdgeProbability(BB, BBSuccProbs); + + // Update the profile metadata as well. + // + // Don't do this if the profile of the transformed blocks was statically + // estimated. (This could occur despite the function having an entry + // frequency in completely cold parts of the CFG.) + // + // In this case we don't want to suggest to subsequent passes that the + // calculated weights are fully consistent. Consider this graph: + // + // check_1 + // 50% / | + // eq_1 | 50% + // \ | + // check_2 + // 50% / | + // eq_2 | 50% + // \ | + // check_3 + // 50% / | + // eq_3 | 50% + // \ | + // + // Assuming the blocks check_* all compare the same value against 1, 2 and 3, + // the overall probabilities are inconsistent; the total probability that the + // value is either 1, 2 or 3 is 150%. + // + // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3 + // becomes 0%. This is even worse if the edge whose probability becomes 0% is + // the loop exit edge. Then based solely on static estimation we would assume + // the loop was extremely hot. + // + // FIXME this locally as well so that BPI and BFI are consistent as well. We + // shouldn't make edges extremely likely or unlikely based solely on static + // estimation. + if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) { + SmallVector Weights; + for (auto Prob : BBSuccProbs) + Weights.push_back(Prob.getNumerator()); + + auto TI = BB->getTerminator(); + TI->setMetadata( + LLVMContext::MD_prof, + MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights)); + } +} Index: llvm/lib/Transforms/Scalar/Scalar.cpp =================================================================== --- llvm/lib/Transforms/Scalar/Scalar.cpp +++ llvm/lib/Transforms/Scalar/Scalar.cpp @@ -59,6 +59,7 @@ initializeInferAddressSpacesPass(Registry); initializeInstSimplifyLegacyPassPass(Registry); initializeJumpThreadingPass(Registry); + initializeControlDependentDCEPass(Registry); initializeLegacyLICMPassPass(Registry); initializeLegacyLoopSinkPassPass(Registry); initializeLoopFuseLegacyPass(Registry); @@ -174,6 +175,10 @@ unwrap(PM)->add(createJumpThreadingPass()); } +void LLVMAddControlDependentDCEPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createControlDependentDCEPass()); +} + void LLVMAddLoopSinkPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopSinkPass()); } Index: llvm/test/Transforms/ControlDependentDCE/conditional-to-unconditional.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/ControlDependentDCE/conditional-to-unconditional.ll @@ -0,0 +1,87 @@ +; RUN: opt -control-dependent-dce -S < %s | FileCheck %s + +declare void @show1() +declare void @show2() +declare void @show3() +declare void @show4() + +@c = external global i32, align 4 + +define void @check1(i32 %x, i32 %y) { +; CHECK-LABEL: @check1( +; CHECK: br i1 %cmp, +; CHECK-NOT: br i1 %cmp, +; CHECK-NOT: if.else: +; Check that control-dependent-dce is able to infer the value of %cmp +; used at the end of basic block if.then and thus that the IR after +; control-dependent-dce contains only one conditional branch based on +; %cmp in basic block entry. +entry: + %cmp = icmp slt i32 %x, %y + br i1 %cmp, label %if.then, label %if.end3 + +if.then: ; preds = %entry + call void () @show1() + br i1 %cmp, label %if.then2, label %if.else + +if.then2: ; preds = %if.then + call void () @show2() + br label %if.end + +if.else: ; preds = %if.then + call void () @show3() + br label %if.end + +if.end: ; preds = %if.else, %if.then2 + br label %if.end3 + +if.end3: ; preds = %if.end, %entry + ret void +} + +define void @check2(i32 %x, i32 %y) { +; CHECK-LABEL: @check2( +; CHECK: br i1 %cmp, +; CHECK-NOT: br i1 %cmp, +; CHECK-NOT: if.else: +; Check that control-dependent-dce is able to infer the value of %cmp +; used at the end of basic block do.body in a loop and thus that the +; IR after control-dependent-dce contains only one conditional branch +; based on %cmp in basic block entry. +entry: + %cmp = icmp slt i32 %x, %y + br i1 %cmp, label %if.then, label %if.end3 + +if.then: ; preds = %entry + call void () @show1() + br label %do.body + +do.body: ; preds = %do.cond, %if.then + call void () @show2() + br i1 %cmp, label %if.then2, label %if.else + +if.then2: ; preds = %do.body + call void () @show3() + br label %if.end + +if.else: ; preds = %do.body + call void () @show4() + br label %if.end + +if.end: ; preds = %if.else, %if.then2 + %0 = load i32, i32* @c, align 4 + %dec = add nsw i32 %0, -1 + store i32 %dec, i32* @c, align 4 + br label %do.cond + +do.cond: ; preds = %if.end + %1 = load i32, i32* @c, align 4 + %tobool = icmp ne i32 %1, 0 + br i1 %tobool, label %do.body, label %do.end + +do.end: ; preds = %do.cond + br label %if.end3 + +if.end3: ; preds = %do.end, %entry + ret void +} Index: llvm/test/Transforms/ControlDependentDCE/duplicate-conditional.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/ControlDependentDCE/duplicate-conditional.ll @@ -0,0 +1,234 @@ +; RUN: opt -control-dependent-dce -S < %s | FileCheck %s + +declare void @show1() +declare void @show2() +declare void @show3() +declare void @show4() +declare void @show5() +declare void @show6() +declare void @show7() + +@E = external global i32, align 4 + +define void @check1(i32 %x, i32 %y) { +; CHECK-LABEL: @check1( +; CHECK: br i1 %cmp, +; CHECK-NOT: br i1 %cmp, +; Check that control-dependent-dce is able to classify the +; predecessors by the value of %cmp used at the end of basic block +; if.end and thus that the IR after control-dependent-dce contains both +; if.end and a duplicate of if.end, which don't use %cmp. +entry: + %cmp = icmp slt i32 %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %do.body + +do.body: ; preds = %do.body, %if.then + %i.0 = phi i32 [ 0, %if.then ], [ %inc, %do.body ] + call void () @show1() + %inc = add nsw i32 %i.0, 1 + %0 = load i32, i32* @E, align 4 + %cmp1 = icmp slt i32 %inc, %0 + br i1 %cmp1, label %do.body, label %do.end + +do.end: ; preds = %do.body + br label %if.end + +if.else: ; preds = %entry + call void () @show2() + br label %if.end + +if.end: ; preds = %if.else, %do.end + call void () @show3() + br i1 %cmp, label %if.then3, label %if.else4 + +if.then3: ; preds = %if.end + call void () @show4() + br label %if.end5 + +if.else4: ; preds = %if.end + call void () @show5() + br label %if.end5 + +if.end5: ; preds = %if.else4, %if.then3 + ret void +} + +define void @check2(i32 %x, i32 %y) { +; CHECK-LABEL: @check2( +; CHECK: br i1 %cmp, +; CHECK: br i1 %cmp, +; Check that control-dependent-dce doesn't duplicate basic blocks +; including any instruction with noduplicate. +; check2 is the same as check1 except noduplicate. +entry: + %cmp = icmp slt i32 %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %do.body + +do.body: ; preds = %do.body, %if.then + %i.0 = phi i32 [ 0, %if.then ], [ %inc, %do.body ] + call void () @show1() + %inc = add nsw i32 %i.0, 1 + %0 = load i32, i32* @E, align 4 + %cmp1 = icmp slt i32 %inc, %0 + br i1 %cmp1, label %do.body, label %do.end + +do.end: ; preds = %do.body + br label %if.end + +if.else: ; preds = %entry + call void () @show2() + br label %if.end + +if.end: ; preds = %if.else, %do.end + call void () @show3() noduplicate + br i1 %cmp, label %if.then3, label %if.else4 + +if.then3: ; preds = %if.end + call void () @show4() + br label %if.end5 + +if.else4: ; preds = %if.end + call void () @show5() + br label %if.end5 + +if.end5: ; preds = %if.else4, %if.then3 + ret void +} + +define void @check3(i32 %x, i32 %y) { +; CHECK-LABEL: @check3( +; CHECK: br i1 %cmp, +; CHECK-NOT: br i1 %cmp, +; Check that control-dependent-dce can handle multiple basic blocks +; (if.end and if.end11) with conditional branches using the value of %cmp. +entry: + %cmp = icmp slt i32 %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %do.body + +do.body: ; preds = %do.body, %if.then + %i.0 = phi i32 [ 0, %if.then ], [ %inc, %do.body ] + call void () @show1() + %inc = add nsw i32 %i.0, 1 + %0 = load i32, i32* @E, align 4 + %cmp1 = icmp slt i32 %inc, %0 + br i1 %cmp1, label %do.body, label %do.end + +do.end: ; preds = %do.body + br label %if.end + +if.else: ; preds = %entry + call void () @show2() + br label %if.end + +if.end: ; preds = %if.else, %do.end + call void () @show3() + br i1 %cmp, label %if.then3, label %if.else10 + +if.then3: ; preds = %if.end + br label %do.body5 + +do.body5: ; preds = %do.body5, %if.then3 + %i4.0 = phi i32 [ 0, %if.then3 ], [ %inc6, %do.body5 ] + call void () @show4() + %inc6 = add nsw i32 %i4.0, 1 + %1 = load i32, i32* @E, align 4 + %cmp8 = icmp slt i32 %inc6, %1 + br i1 %cmp8, label %do.body5, label %do.end9 + +do.end9: ; preds = %do.body5 + br label %if.end11 + +if.else10: ; preds = %if.end + call void () @show5() + br label %if.end11 + +if.end11: ; preds = %if.else10, %do.end9 + br i1 %cmp, label %if.then13, label %if.else14 + +if.then13: ; preds = %if.end11 + call void () @show6() + br label %if.end15 + +if.else14: ; preds = %if.end11 + call void () @show7() + br label %if.end15 + +if.end15: ; preds = %if.else14, %if.then13 + ret void +} + +define void @check4(i32 %x, i32 %y) { +; CHECK-LABEL: @check4( +; CHECK: br i1 %cmp, +; CHECK: br i1 %cmp, +; CHECK-NOT: br i1 %cmp, +; Check that control-dependent-dce doesn't duplicate basic blocks +; including any instruction with noduplicate. +; check4 is the same as check3 except noduplicate. +entry: + %cmp = icmp slt i32 %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %do.body + +do.body: ; preds = %do.body, %if.then + %i.0 = phi i32 [ 0, %if.then ], [ %inc, %do.body ] + call void () @show1() + %inc = add nsw i32 %i.0, 1 + %0 = load i32, i32* @E, align 4 + %cmp1 = icmp slt i32 %inc, %0 + br i1 %cmp1, label %do.body, label %do.end + +do.end: ; preds = %do.body + br label %if.end + +if.else: ; preds = %entry + call void () @show2() + br label %if.end + +if.end: ; preds = %if.else, %do.end + call void () @show3() noduplicate + br i1 %cmp, label %if.then3, label %if.else10 + +if.then3: ; preds = %if.end + br label %do.body5 + +do.body5: ; preds = %do.body5, %if.then3 + %i4.0 = phi i32 [ 0, %if.then3 ], [ %inc6, %do.body5 ] + call void () @show4() + %inc6 = add nsw i32 %i4.0, 1 + %1 = load i32, i32* @E, align 4 + %cmp8 = icmp slt i32 %inc6, %1 + br i1 %cmp8, label %do.body5, label %do.end9 + +do.end9: ; preds = %do.body5 + br label %if.end11 + +if.else10: ; preds = %if.end + call void () @show5() + br label %if.end11 + +if.end11: ; preds = %if.else10, %do.end9 + br i1 %cmp, label %if.then13, label %if.else14 + +if.then13: ; preds = %if.end11 + call void () @show6() + br label %if.end15 + +if.else14: ; preds = %if.end11 + call void () @show7() + br label %if.end15 + +if.end15: ; preds = %if.else14, %if.then13 + ret void +}