Index: llvm/include/llvm/Transforms/Utils/Local.h =================================================================== --- llvm/include/llvm/Transforms/Utils/Local.h +++ llvm/include/llvm/Transforms/Utils/Local.h @@ -16,6 +16,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/ValueMap.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" #include @@ -164,6 +165,26 @@ /// values, but instcombine orders them so it usually won't matter. bool EliminateDuplicatePHINodes(BasicBlock *BB); +/// Class to track cost of simplify CFG transformations. +class SimplifyCFGCostTracker { + /// Number of bonus instructions due to folding branches into predecessors. + /// E.g. folding + /// if (cond1) return false; + /// if (cond2) return false; + /// return true; + /// into + /// if (cond1 | cond2) return false; + /// return true; + /// In this case cond2 is always executed whereas originally it may be + /// evicted due to early exit of cond1. 'cond2' is called bonus instructions + /// and such bonus instructions could accumulate for unrolled loops, therefore + /// use a value map to accumulate their costs across transformations. + ValueMap NumBonusInsts; + +public: + void updateNumBonusInsts(BasicBlock *Parent, unsigned InstCount); + unsigned getNumBonusInsts(BasicBlock *Parent); +}; /// This function is used to do simplification of a CFG. For example, it /// adjusts branches to branches to eliminate the extra hop, it eliminates /// unreachable basic blocks, and does other peephole optimization of the CFG. @@ -174,7 +195,8 @@ bool simplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, DomTreeUpdater *DTU = nullptr, const SimplifyCFGOptions &Options = {}, - ArrayRef LoopHeaders = {}); + ArrayRef LoopHeaders = {}, + SimplifyCFGCostTracker *CostTracker = nullptr); /// This function is used to flatten a CFG. For example, it uses parallel-and /// and parallel-or mode to collapse if-conditions and merge if-regions with @@ -184,7 +206,8 @@ /// If this basic block is ONLY a setcc and a branch, and if a predecessor /// branches to us and one of our successors, fold the setcc into the /// predecessor and use logical operations to pick the right destination. -bool FoldBranchToCommonDest(BranchInst *BI, llvm::DomTreeUpdater *DTU = nullptr, +bool FoldBranchToCommonDest(BranchInst *BI, SimplifyCFGCostTracker &CostTracker, + llvm::DomTreeUpdater *DTU = nullptr, MemorySSAUpdater *MSSAU = nullptr, const TargetTransformInfo *TTI = nullptr, unsigned BonusInstThreshold = 1); Index: llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp =================================================================== --- llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -221,7 +221,8 @@ /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, DomTreeUpdater *DTU, - const SimplifyCFGOptions &Options) { + const SimplifyCFGOptions &Options, + SimplifyCFGCostTracker &CostTracker) { bool Changed = false; bool LocalChange = true; @@ -252,7 +253,7 @@ while (BBIt != F.end() && DTU->isBBPendingDeletion(&*BBIt)) ++BBIt; } - if (simplifyCFG(&BB, TTI, DTU, Options, LoopHeaders)) { + if (simplifyCFG(&BB, TTI, DTU, Options, LoopHeaders, &CostTracker)) { LocalChange = true; ++NumSimpl; } @@ -266,11 +267,13 @@ DominatorTree *DT, const SimplifyCFGOptions &Options) { DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + SimplifyCFGCostTracker CostTracker; bool EverChanged = removeUnreachableBlocks(F, DT ? &DTU : nullptr); EverChanged |= tailMergeBlocksWithSimilarFunctionTerminators(F, DT ? &DTU : nullptr); - EverChanged |= iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options); + EverChanged |= + iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options, CostTracker); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -284,7 +287,8 @@ return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options); + EverChanged = iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options, + CostTracker); EverChanged |= removeUnreachableBlocks(F, DT ? &DTU : nullptr); } while (EverChanged); Index: llvm/lib/Transforms/Utils/LoopSimplify.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -480,6 +480,7 @@ DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA) { + SimplifyCFGCostTracker CostTracker; bool Changed = false; if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); @@ -666,7 +667,7 @@ // The block has now been cleared of all instructions except for // a comparison and a conditional branch. SimplifyCFG may be able // to fold it now. - if (!FoldBranchToCommonDest(BI, /*DTU=*/nullptr, MSSAU)) + if (!FoldBranchToCommonDest(BI, CostTracker, /*DTU=*/nullptr, MSSAU)) continue; // Success. The block is now dead, so remove it from the loop, Index: llvm/lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -201,6 +201,26 @@ STATISTIC(NumInvokesMerged, "Number of invokes that were merged together"); STATISTIC(NumInvokeSetsFormed, "Number of invoke sets that were formed"); +namespace llvm { + +void SimplifyCFGCostTracker::updateNumBonusInsts(BasicBlock *BB, + unsigned InstCount) { + auto Loc = NumBonusInsts.find(BB); + if (Loc == NumBonusInsts.end()) { + NumBonusInsts.insert({BB, 0}); + Loc = NumBonusInsts.find(BB); + } + Loc->second = Loc->second + InstCount; +} +unsigned SimplifyCFGCostTracker::getNumBonusInsts(BasicBlock *BB) { + auto Loc = NumBonusInsts.find(BB); + if (Loc == NumBonusInsts.end()) + return 0; + return Loc->second; +} + +} // namespace llvm + namespace { // The first field contains the value that the switch produces when a certain @@ -237,6 +257,10 @@ ArrayRef LoopHeaders; const SimplifyCFGOptions &Options; bool Resimplify; + // Accumulates number of bonus instructions due to merging basic blocks + // of common destination. + SimplifyCFGCostTracker *CostTracker; + SimplifyCFGCostTracker LocalCostTracker; Value *isValueEqualityComparison(Instruction *TI); BasicBlock *GetValueEqualityComparisonCases( @@ -280,8 +304,10 @@ public: SimplifyCFGOpt(const TargetTransformInfo &TTI, DomTreeUpdater *DTU, const DataLayout &DL, ArrayRef LoopHeaders, - const SimplifyCFGOptions &Opts) - : TTI(TTI), DTU(DTU), DL(DL), LoopHeaders(LoopHeaders), Options(Opts) { + const SimplifyCFGOptions &Opts, + SimplifyCFGCostTracker *CostTracker_) + : TTI(TTI), DTU(DTU), DL(DL), LoopHeaders(LoopHeaders), Options(Opts), + CostTracker(CostTracker_ ? CostTracker_ : &LocalCostTracker) { assert((!DTU || !DTU->hasPostDomTree()) && "SimplifyCFG is not yet capable of maintaining validity of a " "PostDomTree, so don't ask for it."); @@ -3536,8 +3562,9 @@ /// If this basic block is simple enough, and if a predecessor branches to us /// and one of our successors, fold the block into the predecessor and use /// logical operations to pick the right destination. -bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU, - MemorySSAUpdater *MSSAU, +bool llvm::FoldBranchToCommonDest(BranchInst *BI, + SimplifyCFGCostTracker &CostTracker, + DomTreeUpdater *DTU, MemorySSAUpdater *MSSAU, const TargetTransformInfo *TTI, unsigned BonusInstThreshold) { // If this block ends with an unconditional branch, @@ -3609,7 +3636,6 @@ // as "bonus instructions", and only allow this transformation when the // number of the bonus instructions we'll need to create when cloning into // each predecessor does not exceed a certain threshold. - unsigned NumBonusInsts = 0; bool SawVectorOp = false; const unsigned PredCount = Preds.size(); for (Instruction &I : *BB) { @@ -3628,12 +3654,13 @@ // predecessor. Ignore free instructions. if (!TTI || TTI->getInstructionCost(&I, CostKind) != TargetTransformInfo::TCC_Free) { - NumBonusInsts += PredCount; - - // Early exits once we reach the limit. - if (NumBonusInsts > - BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier) - return false; + for (auto PredBB : Preds) { + CostTracker.updateNumBonusInsts(PredBB, PredCount); + // Early exits once we reach the limit. + if (CostTracker.getNumBonusInsts(PredBB) > + BonusInstThreshold * BranchFoldToCommonDestVectorMultiplier) + return false; + } } auto IsBCSSAUse = [BB, &I](Use &U) { @@ -3647,10 +3674,12 @@ if (!all_of(I.uses(), IsBCSSAUse)) return false; } - if (NumBonusInsts > - BonusInstThreshold * - (SawVectorOp ? BranchFoldToCommonDestVectorMultiplier : 1)) - return false; + for (auto PredBB : Preds) { + if (CostTracker.getNumBonusInsts(PredBB) > + BonusInstThreshold * + (SawVectorOp ? BranchFoldToCommonDestVectorMultiplier : 1)) + return false; + } // Ok, we have the budget. Perform the transformation. for (BasicBlock *PredBlock : Preds) { @@ -6796,7 +6825,7 @@ // branches to us and our successor, fold the comparison into the // predecessor and use logical operations to update the incoming value // for PHI nodes in common successor. - if (FoldBranchToCommonDest(BI, DTU, /*MSSAU=*/nullptr, &TTI, + if (FoldBranchToCommonDest(BI, *CostTracker, DTU, /*MSSAU=*/nullptr, &TTI, Options.BonusInstThreshold)) return requestResimplify(); return false; @@ -6865,7 +6894,7 @@ // If this basic block is ONLY a compare and a branch, and if a predecessor // branches to us and one of our successors, fold the comparison into the // predecessor and use logical operations to pick the right destination. - if (FoldBranchToCommonDest(BI, DTU, /*MSSAU=*/nullptr, &TTI, + if (FoldBranchToCommonDest(BI, *CostTracker, DTU, /*MSSAU=*/nullptr, &TTI, Options.BonusInstThreshold)) return requestResimplify(); @@ -7164,8 +7193,9 @@ bool llvm::simplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, DomTreeUpdater *DTU, const SimplifyCFGOptions &Options, - ArrayRef LoopHeaders) { + ArrayRef LoopHeaders, + SimplifyCFGCostTracker *NumBonusInsts) { return SimplifyCFGOpt(TTI, DTU, BB->getModule()->getDataLayout(), LoopHeaders, - Options) + Options, NumBonusInsts) .run(BB); } Index: llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll =================================================================== --- llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll +++ llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -passes='require,loop-unroll,simplifycfg,instcombine' -unroll-force-peel-count=3 -verify-dom-info | FileCheck %s +; RUN: opt < %s -S -passes='require,loop-unroll,simplifycfg,instcombine' -unroll-force-peel-count=3 -verify-dom-info | FileCheck %s define void @basic(i32 %K, i32 %N) { ; CHECK-LABEL: @basic( Index: llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll =================================================================== --- llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -O2 -S < %s | FileCheck %s +; RUN: opt -O2 -bonus-inst-threshold=3 -S < %s | FileCheck %s target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64--" Index: llvm/test/Transforms/SimplifyCFG/branch-fold-threshold.ll =================================================================== --- llvm/test/Transforms/SimplifyCFG/branch-fold-threshold.ll +++ llvm/test/Transforms/SimplifyCFG/branch-fold-threshold.ll @@ -1,9 +1,9 @@ ; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s --check-prefix=NORMAL -; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -bonus-inst-threshold=2 | FileCheck %s --check-prefix=AGGRESSIVE -; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -bonus-inst-threshold=4 | FileCheck %s --check-prefix=WAYAGGRESSIVE +; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -bonus-inst-threshold=3 | FileCheck %s --check-prefix=AGGRESSIVE +; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -bonus-inst-threshold=6 | FileCheck %s --check-prefix=WAYAGGRESSIVE ; RUN: opt %s -passes=simplifycfg -S | FileCheck %s --check-prefix=NORMAL -; RUN: opt %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=AGGRESSIVE -; RUN: opt %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=WAYAGGRESSIVE +; RUN: opt %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=AGGRESSIVE +; RUN: opt %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=WAYAGGRESSIVE define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, i32* %input) { ; NORMAL-LABEL: @foo( Index: llvm/test/Transforms/SimplifyCFG/branch-fold-unrolled-loop.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SimplifyCFG/branch-fold-unrolled-loop.ll @@ -0,0 +1,64 @@ +; RUN: opt %s -S -o - -simplifycfg | FileCheck %s + +target triple = "amdgcn-amd-amdhsa" + +%struct.S = type { [4 x i32] } + +; CHECK-LABEL: define {{.*}}@test( +; CHECK-LABEL: entry: +; CHECK: %arrayidx = getelementptr inbounds %struct.S, %struct.S* %this, i64 0, i32 0, i64 0 +; CHECK-NEXT: %0 = load i32, i32* %arrayidx, align 4 +; CHECK-NEXT: %cmp2 = icmp sgt i32 %0, 0 +; CHECK-NEXT: %arrayidx.1 = getelementptr inbounds %struct.S, %struct.S* %this, i64 0, i32 0, i64 1 +; CHECK-NEXT: %1 = load i32, i32* %arrayidx.1, align 4 +; CHECK-NEXT: %cmp2.1 = icmp sgt i32 %1, 0 +; CHECK-NEXT: %or.cond = select i1 %cmp2, i1 true, i1 %cmp2.1 +; CHECK-NEXT: br i1 %or.cond, label %cleanup, label %for.cond.1 +; CHECK-LABEL: for.cond.1: +; CHECK: %arrayidx.2 = getelementptr inbounds %struct.S, %struct.S* %this, i64 0, i32 0, i64 2 +; CHECK-NEXT: %2 = load i32, i32* %arrayidx.2, align 4 +; CHECK-NEXT: %cmp2.2 = icmp sgt i32 %2, 0 +; CHECK-NEXT: %arrayidx.3 = getelementptr inbounds %struct.S, %struct.S* %this, i64 0, i32 0, i64 3 +; CHECK-NEXT: %3 = load i32, i32* %arrayidx.3, align 4 +; CHECK-NEXT: %cmp2.3 = icmp sgt i32 %3, 0 +; CHECK-NEXT: %or.cond1 = select i1 %cmp2.2, i1 true, i1 %cmp2.3 +; CHECK-NEXT: %spec.select = select i1 %or.cond1, i1 false, i1 true +; CHECK-NEXT: br label %cleanup +; CHECK-LABEL: cleanup: +; CHECK: %cmp = phi i1 [ false, %entry ], [ %spec.select, %for.cond.1 ] +; CHECK-NEXT: ret i1 %cmp + +define zeroext i1 @test(%struct.S* nocapture noundef nonnull readonly align 4 dereferenceable(16) %this) unnamed_addr #0 align 2 { +entry: + %arrayidx = getelementptr inbounds %struct.S, %struct.S* %this, i64 0, i32 0, i64 0 + %0 = load i32, i32* %arrayidx, align 4 + %cmp2 = icmp sgt i32 %0, 0 + br i1 %cmp2, label %cleanup, label %for.cond + +for.cond: + %arrayidx.1 = getelementptr inbounds %struct.S, %struct.S* %this, i64 0, i32 0, i64 1 + %1 = load i32, i32* %arrayidx.1, align 4 + %cmp2.1 = icmp sgt i32 %1, 0 + br i1 %cmp2.1, label %cleanup, label %for.cond.1 + +for.cond.1: + %arrayidx.2 = getelementptr inbounds %struct.S, %struct.S* %this, i64 0, i32 0, i64 2 + %2 = load i32, i32* %arrayidx.2, align 4 + %cmp2.2 = icmp sgt i32 %2, 0 + br i1 %cmp2.2, label %cleanup, label %for.cond.2 + +for.cond.2: + %arrayidx.3 = getelementptr inbounds %struct.S, %struct.S* %this, i64 0, i32 0, i64 3 + %3 = load i32, i32* %arrayidx.3, align 4 + %cmp2.3 = icmp sgt i32 %3, 0 + br i1 %cmp2.3, label %cleanup, label %for.cond.3 + +for.cond.3: + br label %cleanup + +cleanup: + %cmp = phi i1 [ false, %entry ], [ false, %for.cond ], [ false, %for.cond.1 ], [ false, %for.cond.2 ], [ true, %for.cond.3 ] + ret i1 %cmp +} + +attributes #0 = { "target-cpu"="gfx906"} Index: llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll =================================================================== --- llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll +++ llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=1 | FileCheck --check-prefixes=ALL,THR1 %s -; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=2 | FileCheck --check-prefixes=ALL,THR2 %s +; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=3 | FileCheck --check-prefixes=ALL,THR2 %s declare void @sideeffect0() declare void @sideeffect1() @@ -10,7 +10,7 @@ ; Here we'd want to duplicate %v3_adj into two predecessors, ; but -bonus-inst-threshold=1 says that we can only clone it into one. -; With -bonus-inst-threshold=2 we can clone it into both though. +; With -bonus-inst-threshold=3 we can clone it into both though. define void @two_preds_with_extra_op(i8 %v0, i8 %v1, i8 %v2, i8 %v3) { ; THR1-LABEL: @two_preds_with_extra_op( ; THR1-NEXT: entry: