Index: llvm/include/llvm/Transforms/Utils/Local.h =================================================================== --- llvm/include/llvm/Transforms/Utils/Local.h +++ llvm/include/llvm/Transforms/Utils/Local.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/ValueMap.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" #include @@ -164,6 +165,26 @@ /// values, but instcombine orders them so it usually won't matter. bool EliminateDuplicatePHINodes(BasicBlock *BB); +/// Class to track cost of simplify CFG transformations. +class SimplifyCFGCostTracker { + /// Number of bonus instructions due to folding branches into predecessors. + /// E.g. folding + /// if (cond1) return false; + /// if (cond2) return false; + /// return true; + /// into + /// if (cond1 | cond2) return false; + /// return true; + /// In this case cond2 is always executed whereas originally it may be + /// evicted due to early exit of cond1. 'cond2' is called bonus instructions + /// and such bonus instructions could accumulate for unrolled loops, therefore + /// use a value map to accumulate their costs across transformations. + ValueMap NumBonusInsts; + +public: + void updateNumBonusInsts(BasicBlock *Parent, unsigned InstCount); + unsigned getNumBonusInsts(BasicBlock *Parent); +}; /// This function is used to do simplification of a CFG. For example, it /// adjusts branches to branches to eliminate the extra hop, it eliminates /// unreachable basic blocks, and does other peephole optimization of the CFG. @@ -174,7 +195,8 @@ bool simplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, DomTreeUpdater *DTU = nullptr, const SimplifyCFGOptions &Options = {}, - ArrayRef LoopHeaders = {}); + ArrayRef LoopHeaders = {}, + SimplifyCFGCostTracker *CostTracker = nullptr); /// This function is used to flatten a CFG. For example, it uses parallel-and /// and parallel-or mode to collapse if-conditions and merge if-regions with @@ -184,7 +206,8 @@ /// If this basic block is ONLY a setcc and a branch, and if a predecessor /// branches to us and one of our successors, fold the setcc into the /// predecessor and use logical operations to pick the right destination. -bool FoldBranchToCommonDest(BranchInst *BI, llvm::DomTreeUpdater *DTU = nullptr, +bool FoldBranchToCommonDest(BranchInst *BI, SimplifyCFGCostTracker &CostTracker, + llvm::DomTreeUpdater *DTU = nullptr, MemorySSAUpdater *MSSAU = nullptr, const TargetTransformInfo *TTI = nullptr, unsigned BonusInstThreshold = 1); Index: llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp =================================================================== --- llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -221,7 +221,8 @@ /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, DomTreeUpdater *DTU, - const SimplifyCFGOptions &Options) { + const SimplifyCFGOptions &Options, + SimplifyCFGCostTracker &CostTracker) { bool Changed = false; bool LocalChange = true; @@ -252,7 +253,7 @@ while (BBIt != F.end() && DTU->isBBPendingDeletion(&*BBIt)) ++BBIt; } - if (simplifyCFG(&BB, TTI, DTU, Options, LoopHeaders)) { + if (simplifyCFG(&BB, TTI, DTU, Options, LoopHeaders, &CostTracker)) { LocalChange = true; ++NumSimpl; } @@ -266,11 +267,13 @@ DominatorTree *DT, const SimplifyCFGOptions &Options) { DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + SimplifyCFGCostTracker CostTracker; bool EverChanged = removeUnreachableBlocks(F, DT ? &DTU : nullptr); EverChanged |= tailMergeBlocksWithSimilarFunctionTerminators(F, DT ? &DTU : nullptr); - EverChanged |= iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options); + EverChanged |= + iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options, CostTracker); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -284,7 +287,8 @@ return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options); + EverChanged = iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options, + CostTracker); EverChanged |= removeUnreachableBlocks(F, DT ? &DTU : nullptr); } while (EverChanged); Index: llvm/lib/Transforms/Utils/LoopSimplify.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -480,6 +480,7 @@ DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA) { + SimplifyCFGCostTracker CostTracker; bool Changed = false; if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); @@ -666,7 +667,7 @@ // The block has now been cleared of all instructions except for // a comparison and a conditional branch. SimplifyCFG may be able // to fold it now. - if (!FoldBranchToCommonDest(BI, /*DTU=*/nullptr, MSSAU)) + if (!FoldBranchToCommonDest(BI, CostTracker, /*DTU=*/nullptr, MSSAU)) continue; // Success. The block is now dead, so remove it from the loop, Index: llvm/lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -201,6 +201,21 @@ STATISTIC(NumInvokesMerged, "Number of invokes that were merged together"); STATISTIC(NumInvokeSetsFormed, "Number of invoke sets that were formed"); +namespace llvm { + +void SimplifyCFGCostTracker::updateNumBonusInsts(BasicBlock *BB, + unsigned InstCount) { + auto Loc = NumBonusInsts.find(BB); + if (Loc == NumBonusInsts.end()) + Loc = NumBonusInsts.insert({BB, 0}).first; + Loc->second = Loc->second + InstCount; +} +unsigned SimplifyCFGCostTracker::getNumBonusInsts(BasicBlock *BB) { + return NumBonusInsts.lookup(BB); +} + +} // namespace llvm + namespace { // The first field contains the value that the switch produces when a certain @@ -237,6 +252,10 @@ ArrayRef LoopHeaders; const SimplifyCFGOptions &Options; bool Resimplify; + // Accumulates number of bonus instructions due to merging basic blocks + // of common destination. + SimplifyCFGCostTracker *CostTracker; + SimplifyCFGCostTracker LocalCostTracker; Value *isValueEqualityComparison(Instruction *TI); BasicBlock *GetValueEqualityComparisonCases( @@ -280,8 +299,10 @@ public: SimplifyCFGOpt(const TargetTransformInfo &TTI, DomTreeUpdater *DTU, const DataLayout &DL, ArrayRef LoopHeaders, - const SimplifyCFGOptions &Opts) - : TTI(TTI), DTU(DTU), DL(DL), LoopHeaders(LoopHeaders), Options(Opts) { + const SimplifyCFGOptions &Opts, + SimplifyCFGCostTracker *CostTracker_) + : TTI(TTI), DTU(DTU), DL(DL), LoopHeaders(LoopHeaders), Options(Opts), + CostTracker(CostTracker_ ? CostTracker_ : &LocalCostTracker) { assert((!DTU || !DTU->hasPostDomTree()) && "SimplifyCFG is not yet capable of maintaining validity of a " "PostDomTree, so don't ask for it."); @@ -3536,8 +3557,9 @@ /// If this basic block is simple enough, and if a predecessor branches to us /// and one of our successors, fold the block into the predecessor and use /// logical operations to pick the right destination. -bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU, - MemorySSAUpdater *MSSAU, +bool llvm::FoldBranchToCommonDest(BranchInst *BI, + SimplifyCFGCostTracker &CostTracker, + DomTreeUpdater *DTU, MemorySSAUpdater *MSSAU, const TargetTransformInfo *TTI, unsigned BonusInstThreshold) { // If this block ends with an unconditional branch, @@ -3609,7 +3631,6 @@ // as "bonus instructions", and only allow this transformation when the // number of the bonus instructions we'll need to create when cloning into // each predecessor does not exceed a certain threshold. - unsigned NumBonusInsts = 0; bool SawVectorOp = false; const unsigned PredCount = Preds.size(); for (Instruction &I : *BB) { @@ -3628,12 +3649,13 @@ // predecessor. Ignore free instructions. if (!TTI || TTI->getInstructionCost(&I, CostKind) != TargetTransformInfo::TCC_Free) { - NumBonusInsts += PredCount; - - // Early exits once we reach the limit. - if (NumBonusInsts > - BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier) - return false; + for (auto PredBB : Preds) { + CostTracker.updateNumBonusInsts(PredBB, PredCount); + // Early exits once we reach the limit. + if (CostTracker.getNumBonusInsts(PredBB) > + BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier) + return false; + } } auto IsBCSSAUse = [BB, &I](Use &U) { @@ -3647,10 +3669,12 @@ if (!all_of(I.uses(), IsBCSSAUse)) return false; } - if (NumBonusInsts > - BonusInstThreshold * - (SawVectorOp ? BranchFoldToCommonDestVectorMultiplier : 1)) - return false; + for (auto PredBB : Preds) { + if (CostTracker.getNumBonusInsts(PredBB) > + BonusInstThreshold * + (SawVectorOp ? BranchFoldToCommonDestVectorMultiplier : 1)) + return false; + } // Ok, we have the budget. Perform the transformation. for (BasicBlock *PredBlock : Preds) { @@ -6796,7 +6820,7 @@ // branches to us and our successor, fold the comparison into the // predecessor and use logical operations to update the incoming value // for PHI nodes in common successor. - if (FoldBranchToCommonDest(BI, DTU, /*MSSAU=*/nullptr, &TTI, + if (FoldBranchToCommonDest(BI, *CostTracker, DTU, /*MSSAU=*/nullptr, &TTI, Options.BonusInstThreshold)) return requestResimplify(); return false; @@ -6865,7 +6889,7 @@ // If this basic block is ONLY a compare and a branch, and if a predecessor // branches to us and one of our successors, fold the comparison into the // predecessor and use logical operations to pick the right destination. - if (FoldBranchToCommonDest(BI, DTU, /*MSSAU=*/nullptr, &TTI, + if (FoldBranchToCommonDest(BI, *CostTracker, DTU, /*MSSAU=*/nullptr, &TTI, Options.BonusInstThreshold)) return requestResimplify(); @@ -7164,8 +7188,9 @@ bool llvm::simplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, DomTreeUpdater *DTU, const SimplifyCFGOptions &Options, - ArrayRef LoopHeaders) { + ArrayRef LoopHeaders, + SimplifyCFGCostTracker *CostTracker) { return SimplifyCFGOpt(TTI, DTU, BB->getModule()->getDataLayout(), LoopHeaders, - Options) + Options, CostTracker) .run(BB); } Index: llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll =================================================================== --- llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll +++ llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -passes='require,loop-unroll,simplifycfg,instcombine' -unroll-force-peel-count=3 -verify-dom-info | FileCheck %s +; RUN: opt < %s -S -passes='require,loop-unroll,simplifycfg,instcombine' -unroll-force-peel-count=3 -verify-dom-info | FileCheck %s define void @basic(i32 %K, i32 %N) { ; CHECK-LABEL: @basic( Index: llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll =================================================================== --- llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -O2 -S < %s | FileCheck %s +; RUN: opt -bonus-inst-threshold=4 -O2 -S < %s | FileCheck %s target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64--" Index: llvm/test/Transforms/SimplifyCFG/branch-fold-multiple.ll =================================================================== --- llvm/test/Transforms/SimplifyCFG/branch-fold-multiple.ll +++ llvm/test/Transforms/SimplifyCFG/branch-fold-multiple.ll @@ -6,9 +6,12 @@ %struct.S = type { [4 x i32] } -; Check the second, third, and fourth basic blocks are folded into -; the first basic block since each has one bonus intruction, which -; is below the default bouns instruction threshold 2. +; Check the second basic block is folded into the first basic block +; since it has one bonus intruction. The third basic block is not +; folded into the first basic block since the accumulated bonus +; instructions will exceed default threshold 2. The fourth basic +; block is foled into the third basic block since the accumulated +; bonus instruction cost is 1. define zeroext i1 @test1(%struct.S* nocapture noundef nonnull readonly align 4 dereferenceable(16) %this) unnamed_addr #0 align 2 { ; CHECK-LABEL: @test1( @@ -20,16 +23,20 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[CMP2_1:%.*]] = icmp sgt i32 [[TMP1]], 0 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP2]], i1 true, i1 [[CMP2_1]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[FOR_COND_1:%.*]] +; CHECK: for.cond.1: ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[THIS]], i64 0, i32 0, i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 ; CHECK-NEXT: [[CMP2_2:%.*]] = icmp sgt i32 [[TMP2]], 0 -; CHECK-NEXT: [[OR_COND1:%.*]] = select i1 [[OR_COND]], i1 true, i1 [[CMP2_2]] ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[THIS]], i64 0, i32 0, i64 3 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 ; CHECK-NEXT: [[CMP2_3:%.*]] = icmp sgt i32 [[TMP3]], 0 -; CHECK-NEXT: [[OR_COND2:%.*]] = select i1 [[OR_COND1]], i1 true, i1 [[CMP2_3]] -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[OR_COND2]], i1 false, i1 true -; CHECK-NEXT: ret i1 [[SPEC_SELECT]] +; CHECK-NEXT: [[OR_COND1:%.*]] = select i1 [[CMP2_2]], i1 true, i1 [[CMP2_3]] +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[OR_COND1]], i1 false, i1 true +; CHECK-NEXT: br label [[CLEANUP]] +; CHECK: cleanup: +; CHECK-NEXT: [[CMP:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[FOR_COND_1]] ] +; CHECK-NEXT: ret i1 [[CMP]] ; entry: %arrayidx = getelementptr inbounds %struct.S, %struct.S* %this, i64 0, i32 0, i64 0 Index: llvm/test/Transforms/SimplifyCFG/branch-fold-threshold.ll =================================================================== --- llvm/test/Transforms/SimplifyCFG/branch-fold-threshold.ll +++ llvm/test/Transforms/SimplifyCFG/branch-fold-threshold.ll @@ -1,9 +1,9 @@ ; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s --check-prefix=NORMAL -; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -bonus-inst-threshold=2 | FileCheck %s --check-prefix=AGGRESSIVE -; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -bonus-inst-threshold=4 | FileCheck %s --check-prefix=WAYAGGRESSIVE +; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -bonus-inst-threshold=3 | FileCheck %s --check-prefix=AGGRESSIVE +; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -bonus-inst-threshold=6 | FileCheck %s --check-prefix=WAYAGGRESSIVE ; RUN: opt %s -passes=simplifycfg -S | FileCheck %s --check-prefix=NORMAL -; RUN: opt %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=AGGRESSIVE -; RUN: opt %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=WAYAGGRESSIVE +; RUN: opt %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=AGGRESSIVE +; RUN: opt %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=WAYAGGRESSIVE define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, i32* %input) { ; NORMAL-LABEL: @foo( Index: llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll =================================================================== --- llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll +++ llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=1 | FileCheck --check-prefixes=ALL,THR1 %s -; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=2 | FileCheck --check-prefixes=ALL,THR2 %s +; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=3 | FileCheck --check-prefixes=ALL,THR2 %s declare void @sideeffect0() declare void @sideeffect1() @@ -10,7 +10,7 @@ ; Here we'd want to duplicate %v3_adj into two predecessors, ; but -bonus-inst-threshold=1 says that we can only clone it into one. -; With -bonus-inst-threshold=2 we can clone it into both though. +; With -bonus-inst-threshold=3 we can clone it into both though. define void @two_preds_with_extra_op(i8 %v0, i8 %v1, i8 %v2, i8 %v3) { ; THR1-LABEL: @two_preds_with_extra_op( ; THR1-NEXT: entry: