Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -142,6 +142,7 @@ void initializeForceFunctionAttrsLegacyPassPass(PassRegistry&); void initializeGCMachineCodeAnalysisPass(PassRegistry&); void initializeGCModuleInfoPass(PassRegistry&); +void initializeGuardWideningLegacyPassPass(PassRegistry&); void initializeGVNLegacyPassPass(PassRegistry&); void initializeGlobalDCELegacyPassPass(PassRegistry&); void initializeGlobalOptLegacyPassPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -99,6 +99,7 @@ (void) llvm::createGlobalDCEPass(); (void) llvm::createGlobalOptimizerPass(); (void) llvm::createGlobalsAAWrapperPass(); + (void) llvm::createGuardWideningPass(); (void) llvm::createIPConstantPropagationPass(); (void) llvm::createIPSCCPPass(); (void) llvm::createInductiveRangeCheckEliminationPass(); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -81,6 +81,16 @@ // FunctionPass *createAggressiveDCEPass(); + +//===----------------------------------------------------------------------===// +// +// GuardWidening - An optimization over the @llvm.experimental.guard intrinsic +// that (optimistically) combines multiple guards into one to have fewer checks +// at runtime. +// +FunctionPass *createGuardWideningPass(); + + //===----------------------------------------------------------------------===// // // BitTrackingDCE - This pass uses a bit-tracking DCE algorithm in order to Index: include/llvm/Transforms/Scalar/GuardWidening.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Scalar/GuardWidening.h @@ -0,0 +1,32 @@ +//===- GuardWidening.h - ----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Guard widening is an optimization over the @llvm.experimental.guard intrinsic +// that (optimistically) combines multiple guards into one to have fewer checks +// at runtime. +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_TRANSFORMS_SCALAR_GUARD_WIDENING_H +#define LLVM_TRANSFORMS_SCALAR_GUARD_WIDENING_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class Function; + +struct GuardWideningPass : public PassInfoMixin { + PreservedAnalyses run(Function &F, AnalysisManager &AM); +}; +} + + +#endif // LLVM_TRANSFORMS_SCALAR_GUARD_WIDENING_H Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -66,6 +66,7 @@ #include "llvm/Transforms/Scalar/DCE.h" #include "llvm/Transforms/Scalar/DeadStoreElimination.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Scalar/GuardWidening.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/LoopRotation.h" #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" Index: lib/Passes/PassRegistry.def =================================================================== --- lib/Passes/PassRegistry.def +++ lib/Passes/PassRegistry.def @@ -118,6 +118,7 @@ FUNCTION_PASS("no-op-function", NoOpFunctionPass()) FUNCTION_PASS("loweratomic", LowerAtomicPass()) FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass()) +FUNCTION_PASS("guard-widening", GuardWideningPass()) FUNCTION_PASS("gvn", GVN()) FUNCTION_PASS("print", PrintFunctionPass(dbgs())) FUNCTION_PASS("print", AssumptionPrinterPass(dbgs())) Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -10,6 +10,7 @@ EarlyCSE.cpp FlattenCFGPass.cpp Float2Int.cpp + GuardWidening.cpp GVN.cpp InductiveRangeCheckElimination.cpp IndVarSimplify.cpp Index: lib/Transforms/Scalar/GuardWidening.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/GuardWidening.cpp @@ -0,0 +1,427 @@ +//===- GuardWidening.cpp - ---- Guard widening ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the guard widening pass. The semantics of the +// @llvm.experimental.guard intrinsic lets LLVM transform it so that it fails +// more often that it did before the transform. This optimization is called +// "widening" and can be used hoist and common runtime checks in situations like +// these: +// +// %cmp0 = 7 u< Length +// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ] +// call @unknown_side_effects() +// %cmp1 = 9 u< Length +// call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ] +// ... +// +// => +// +// %cmp0 = 9 u< Length +// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ] +// call @unknown_side_effects() +// ... +// +// If %cmp0 is false, @llvm.experimental.guard will "deoptimize" back to a +// generic implementation of the same function, which will have the correct +// semantics from that point onward. It is always _legal_ to deoptimize (so +// replacing %cmp0 with false is "correct"), though it may not always be +// profitable to do so. +// +// NB! This pass is a work in progress. It hasn't been tuned to be "production +// ready" yet. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/GuardWidening.h" +#include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +#define DEBUG_TYPE "guard-widening" + +namespace { + +class GuardWideningImpl { + DominatorTree &DT; + PostDominatorTree &PDT; + LoopInfo &LI; + + /// The set of guards whose conditions have been widened into dominating + /// guards. + SmallVector EliminatedGuards; + + /// The set of guards which have been widened to include conditions to other + /// guards. + DenseSet WidenedGuards; + + /// Try to eliminate guard \p Guard by widening it into an earlier dominating + /// guard. \p DFSI is the DFS iterator on the dominator tree that is + /// currently visiting the block containing \p Guard, and \p GuardsPerBlock + /// maps BasicBlocks to the set of guards seen in that block. + bool eliminateGuardViaWidening( + IntrinsicInst *Guard, const df_iterator &DFSI, + const DenseMap> & + GuardsPerBlock); + + /// Used to keep track of which widening potential is more effective. + enum WideningScore { + /// Don't widen. + WS_IllegalOrNegative, + + /// Widening is performance neutral as far as the cycles spent in check + /// conditions goes (but can still help, e.g., code layout, having less + /// deopt state). + WS_Neutral, + + /// Widening is profitable. + WS_Positive, + + /// Widening is very profitable. Not significantly different from \c + /// WS_Positive, except by the order. + WS_VeryPositive + }; + + static StringRef scoreTypeToString(WideningScore WS); + + /// Compute the score for widening the condition in \p DominatedGuard + /// (contained in \p DominatedGuardLoop) into \p DominatingGuard (contained in + /// \p DominatingGuardLoop). + WideningScore computeWideningScore(IntrinsicInst *DominatedGuard, + Loop *DominatedGuardLoop, + IntrinsicInst *DominatingGuard, + Loop *DominatingGuardLoop); + + /// Helper to check if \p V can be hoisted to \p InsertPos. + bool isAvailableAt(Value *V, Instruction *InsertPos) { + SmallPtrSet Visited; + return isAvailableAt(V, InsertPos, Visited); + } + + bool isAvailableAt(Value *V, Instruction *InsertPos, + SmallPtrSetImpl &Visited); + + /// Helper to hoist \p V to \p InsertPos. Guaranteed to succeed if \c + /// isAvailableAt returned true. + void makeAvailableAt(Value *V, Instruction *InsertPos); + + /// Common helper used by \c widenGuard and \c isWideningCondProfitable. Try + /// to generate an expression computing the logical AND of \p Cond0 and \p + /// Cond1. Return true if the expression computing the AND is only as + /// expensive as computing one of the two. If \p InsertPt is true then + /// actually generate the resulting expression, make it available at \p + /// InsertPt and return it in \p Result (else no change to the IR is made). + bool widenCondCommon(Value *Cond0, Value *Cond1, Instruction *InsertPt, + Value *&Result); + + /// Can we compute the logical AND of \p Cond0 and \p Cond1 for the price of + /// computing only one of the two expressions? + bool isWideningCondProfitable(Value *Cond0, Value *Cond1) { + Value *ResultUnused; + return widenCondCommon(Cond0, Cond1, /*InsertPt=*/nullptr, ResultUnused); + } + + /// Widen \p ToWiden to fail if \p NewCondition is false (in addition to + /// whatever it is already checking). + void widenGuard(IntrinsicInst *ToWiden, Value *NewCondition) { + Value *Result; + widenCondCommon(ToWiden->getArgOperand(0), NewCondition, ToWiden, Result); + ToWiden->setArgOperand(0, Result); + } + +public: + explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree &PDT, + LoopInfo &LI) + : DT(DT), PDT(PDT), LI(LI) {} + + /// The entry point for this pass. + bool run(); +}; + +struct GuardWideningLegacyPass : public FunctionPass { + static char ID; + GuardWideningPass Impl; + + GuardWideningLegacyPass() : FunctionPass(ID) { + initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + return GuardWideningImpl( + getAnalysis().getDomTree(), + getAnalysis().getPostDomTree(), + getAnalysis().getLoopInfo()).run(); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } +}; + +} + +bool GuardWideningImpl::run() { + using namespace llvm::PatternMatch; + + DenseMap> GuardsInBlock; + bool Changed = false; + + for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode()); + DFI != DFE; ++DFI) { + auto *BB = (*DFI)->getBlock(); + auto &CurrentList = GuardsInBlock[BB]; + + for (auto &I : *BB) + if (match(&I, m_Intrinsic())) + CurrentList.push_back(cast(&I)); + + for (auto *II : CurrentList) + Changed |= eliminateGuardViaWidening(II, DFI, GuardsInBlock); + } + + for (auto *II : EliminatedGuards) + if (!WidenedGuards.count(II)) + II->eraseFromParent(); + + return Changed; +} + +bool GuardWideningImpl::eliminateGuardViaWidening( + IntrinsicInst *GuardInst, const df_iterator &DFSI, + const DenseMap> & + GuardsInBlock) { + IntrinsicInst *BestSoFar = nullptr; + auto BestScoreSoFar = WS_IllegalOrNegative; + auto *GuardInstLoop = LI.getLoopFor(GuardInst->getParent()); + + // In the set of dominating guards, find the one we can merge GuardInst with + // for the most profit. + for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) { + auto *CurBB = DFSI.getPath(i)->getBlock(); + auto *CurLoop = LI.getLoopFor(CurBB); + assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!"); + const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second; + + auto I = GuardsInCurBB.begin(); + auto E = GuardsInCurBB.end(); + +#ifndef NDEBUG + { + unsigned Index = 0; + for (auto &I : *CurBB) { + if (Index == GuardsInCurBB.size()) + break; + if (GuardsInCurBB[Index] == &I) + Index++; + } + assert(Index == GuardsInCurBB.size() && + "Guards expected to be in order!"); + } +#endif + + assert((i == (e - 1)) == (GuardInst->getParent() == CurBB) && "Bad DFS?"); + + if (i == (e - 1)) { + // Corner case: make sure we're only looking at guards strictly dominating + // GuardInst when visiting GuardInst->getParent(). + auto NewEnd = std::find(I, E, GuardInst); + assert(NewEnd != E && "GuardInst not in its own block?"); + E = NewEnd; + } + + for (auto *Candidate : make_range(I, E)) { + auto Score = + computeWideningScore(GuardInst, GuardInstLoop, Candidate, CurLoop); + DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0) + << " and " << *Candidate->getArgOperand(0) << " is " + << scoreTypeToString(Score) << "\n"); + if (Score > BestScoreSoFar) { + BestScoreSoFar = Score; + BestSoFar = Candidate; + } + } + } + + if (BestScoreSoFar == WS_IllegalOrNegative) { + DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n"); + return false; + } + + assert(BestSoFar != GuardInst && "Should have never visited same guard!"); + assert(DT.dominates(BestSoFar, GuardInst) && "Should be!"); + + DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar + << " with score " << scoreTypeToString(BestScoreSoFar) << "\n"); + widenGuard(BestSoFar, GuardInst->getArgOperand(0)); + GuardInst->setArgOperand(0, ConstantInt::getTrue(GuardInst->getContext())); + EliminatedGuards.push_back(GuardInst); + WidenedGuards.insert(BestSoFar); + return true; +} + +GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore( + IntrinsicInst *DominatedGuard, Loop *DominatedGuardLoop, + IntrinsicInst *DominatingGuard, Loop *DominatingGuardLoop) { + bool HoistingOutOfLoop = false; + + if (DominatingGuardLoop != DominatedGuardLoop) { + if (DominatingGuardLoop && + !DominatingGuardLoop->contains(DominatedGuardLoop)) + return WS_IllegalOrNegative; + + HoistingOutOfLoop = true; + } + + if (!isAvailableAt(DominatedGuard->getArgOperand(0), DominatingGuard)) + return WS_IllegalOrNegative; + + bool HoistingOutOfIf = + !PDT.dominates(DominatedGuard->getParent(), DominatingGuard->getParent()); + + if (isWideningCondProfitable(DominatedGuard->getArgOperand(0), + DominatingGuard->getArgOperand(0))) + return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive; + + if (HoistingOutOfLoop) + return WS_Positive; + + return HoistingOutOfIf ? WS_IllegalOrNegative : WS_Neutral; +} + +bool GuardWideningImpl::isAvailableAt(Value *V, Instruction *Loc, + SmallPtrSetImpl &Visited) { + auto *Inst = dyn_cast(V); + if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst)) + return true; + + if (!isSafeToSpeculativelyExecute(Inst, Loc, &DT) || + Inst->mayReadFromMemory()) + return false; + + Visited.insert(Inst); + + // We only want to go _up_ the dominance chain when recursing. + assert(!isa(Loc) && + "PHIs should return false for isSafeToSpeculativelyExecute"); + assert(DT.isReachableFromEntry(Inst->getParent()) && + "We did a DFS from the block entry!"); + return all_of(Inst->operands(), + [&](Value *Op) { return isAvailableAt(Op, Loc, Visited); }); +} + +void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) { + auto *Inst = dyn_cast(V); + if (!Inst || DT.dominates(Inst, Loc)) + return; + + assert(isSafeToSpeculativelyExecute(Inst, Loc, &DT) && + !Inst->mayReadFromMemory() && "Should've checked with isAvailableAt!"); + + for (Value *Op : Inst->operands()) + makeAvailableAt(Op, Loc); + + Inst->moveBefore(Loc); +} + +bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1, + Instruction *InsertPt, Value *&Result) { + using namespace llvm::PatternMatch; + + { + // L >u C0 && L >u C1 -> L >u max(C0, C1) + ConstantInt *RHS0, *RHS1; + Value *LHS; + ICmpInst::Predicate Pred0, Pred1; + if (match(Cond0, m_ICmp(Pred0, m_Value(LHS), m_ConstantInt(RHS0))) && + match(Cond1, m_ICmp(Pred1, m_Specific(LHS), m_ConstantInt(RHS1)))) { + + // TODO: This logic should be generalized and refactored into a new + // Constant::getEquivalentICmp helper. + if (Pred0 == ICmpInst::ICMP_NE && RHS0->isZero()) + Pred0 = ICmpInst::ICMP_UGT; + if (Pred1 == ICmpInst::ICMP_NE && RHS1->isZero()) + Pred1 = ICmpInst::ICMP_UGT; + + if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_UGT) { + if (InsertPt) { + ConstantInt *NewRHS = + RHS0->getValue().ugt(RHS1->getValue()) ? RHS0 : RHS1; + Result = new ICmpInst(InsertPt, ICmpInst::ICMP_UGT, LHS, NewRHS, + "wide.chk"); + } + + return true; + } + } + } + + // Base case -- just logical-and the two conditions together. + + if (InsertPt) { + makeAvailableAt(Cond0, InsertPt); + makeAvailableAt(Cond1, InsertPt); + + Result = BinaryOperator::CreateAnd(Cond0, Cond1, "wide.chk", InsertPt); + } + + // We were not able to compute Cond0 AND Cond1 for the price of one. + return false; +} + +PreservedAnalyses GuardWideningPass::run(Function &F, + AnalysisManager &AM) { + auto &DT = AM.getResult(F); + auto &LI = AM.getResult(F); + auto &PDT = AM.getResult(F); + bool Changed = GuardWideningImpl(DT, PDT, LI).run(); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { + switch (WS) { + case WS_IllegalOrNegative: + return "IllegalOrNegative"; + case WS_Neutral: + return "Neutral"; + case WS_Positive: + return "Positive"; + case WS_VeryPositive: + return "VeryPositive"; + } + + llvm_unreachable("Fully covered switch above!"); +} + +char GuardWideningLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards", + false, false) + +FunctionPass *llvm::createGuardWideningPass() { + return new GuardWideningLegacyPass(); +} Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -41,6 +41,7 @@ initializeDeadInstEliminationPass(Registry); initializeScalarizerPass(Registry); initializeDSELegacyPassPass(Registry); + initializeGuardWideningLegacyPassPass(Registry); initializeGVNLegacyPassPass(Registry); initializeEarlyCSELegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); Index: test/Transforms/GuardWidening/basic.ll =================================================================== --- /dev/null +++ test/Transforms/GuardWidening/basic.ll @@ -0,0 +1,337 @@ +; RUN: opt -S -guard-widening < %s | FileCheck %s +; RUN: opt -S -passes=guard-widening < %s | FileCheck %s + +declare void @llvm.experimental.guard(i1,...) + +; Basic test case: we wide the first check to check both the +; conditions. +define void @f_0(i1 %cond_0, i1 %cond_1) { +; CHECK-LABEL: @f_0( +entry: +; CHECK: %wide.chk = and i1 %cond_0, %cond_1 +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ] +; CHECK: ret void + + call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + ret void +} + +; Same as @f_0, but with using a more general notion of postdominance. +define void @f_1(i1 %cond_0, i1 %cond_1) { +; CHECK-LABEL: @f_1( +entry: +; CHECK: %wide.chk = and i1 %cond_0, %cond_1 +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ] +; CHECK: br i1 undef, label %left, label %right + + call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] + br i1 undef, label %left, label %right + +left: + br label %merge + +right: + br label %merge + +merge: +; CHECK: merge: +; CHECK-NOT: call void (i1, ...) @llvm.experimental.guard( +; CHECK: ret void + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + ret void +} + +; Like @f_1, but we have some code we need to hoist before we can +; widen a dominanting check. +define void @f_2(i32 %a, i32 %b) { +; CHECK-LABEL: @f_2( +entry: +; CHECK: %cond_0 = icmp ult i32 %a, 10 +; CHECK: %cond_1 = icmp ult i32 %b, 10 +; CHECK: %wide.chk = and i1 %cond_0, %cond_1 +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ] +; CHECK: br i1 undef, label %left, label %right + + %cond_0 = icmp ult i32 %a, 10 + call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] + br i1 undef, label %left, label %right + +left: + br label %merge + +right: + br label %merge + +merge: + %cond_1 = icmp ult i32 %b, 10 + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + ret void +} + +; Negative test: don't hoist stuff out of control flow +; indiscriminately, since that can make us do more work than needed. +define void @f_3(i32 %a, i32 %b) { +; CHECK-LABEL: @f_3( +entry: +; CHECK: %cond_0 = icmp ult i32 %a, 10 +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] +; CHECK: br i1 undef, label %left, label %right + + %cond_0 = icmp ult i32 %a, 10 + call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] + br i1 undef, label %left, label %right + +left: +; CHECK: left: +; CHECK: %cond_1 = icmp ult i32 %b, 10 +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] +; CHECK: ret void + + %cond_1 = icmp ult i32 %b, 10 + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + ret void + +right: + ret void +} + +; But hoisting out of control flow is fine if it makes a loop computed +; condition loop invariant. This behavior may require some tuning in +; the future. +define void @f_4(i32 %a, i32 %b) { +; CHECK-LABEL: @f_4( +entry: +; CHECK: %cond_0 = icmp ult i32 %a, 10 +; CHECK: %cond_1 = icmp ult i32 %b, 10 +; CHECK: %wide.chk = and i1 %cond_0, %cond_1 +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ] +; CHECK: br i1 undef, label %loop, label %leave + + %cond_0 = icmp ult i32 %a, 10 + call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] + br i1 undef, label %loop, label %leave + +loop: + %cond_1 = icmp ult i32 %b, 10 + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + br i1 undef, label %loop, label %leave + +leave: + ret void +} + +; Hoisting out of control flow is also fine if we can widen the +; dominating check without doing any extra work. +define void @f_5(i32 %a) { +; CHECK-LABEL: @f_5( +entry: +; CHECK: %wide.chk = icmp ugt i32 %a, 10 +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ] +; CHECK: br i1 undef, label %left, label %right + + %cond_0 = icmp ugt i32 %a, 7 + call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] + br i1 undef, label %left, label %right + +left: + %cond_1 = icmp ugt i32 %a, 10 + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + ret void + +right: + ret void +} + +; Negative test: the load from %a can be safely speculated to before +; the first guard, but there is no guarantee that it will produce the +; same value. +define void @f_6(i1* dereferenceable(32) %a, i1* %b, i1 %unknown) { +; CHECK-LABEL: @f_6( +; CHECK: call void (i1, ...) @llvm.experimental.guard( +; CHECK: call void (i1, ...) @llvm.experimental.guard( +; CHECK: ret void +entry: + %cond_0 = load i1, i1* %a + call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] + store i1 %unknown, i1* %b + %cond_1 = load i1, i1* %a + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + ret void +} + +; All else equal, we try to widen the earliest guard we can. This +; heuristic can use some tuning. +define void @f_7(i32 %a, i1* %cond_buf) { +; CHECK-LABEL: @f_7( +entry: +; CHECK: %cond_1 = load volatile i1, i1* %cond_buf +; CHECK: %cond_3 = icmp ult i32 %a, 7 +; CHECK: %wide.chk = and i1 %cond_1, %cond_3 +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ] +; CHECK: %cond_2 = load volatile i1, i1* %cond_buf +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond_2) [ "deopt"() ] +; CHECK: br i1 undef, label %left, label %right + + %cond_1 = load volatile i1, i1* %cond_buf + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + %cond_2 = load volatile i1, i1* %cond_buf + call void(i1, ...) @llvm.experimental.guard(i1 %cond_2) [ "deopt"() ] + br i1 undef, label %left, label %right + +left: + %cond_3 = icmp ult i32 %a, 7 + call void(i1, ...) @llvm.experimental.guard(i1 %cond_3) [ "deopt"() ] + br label %left + +right: + ret void +} + +; In this case the earliest dominating guard is in a loop, and we +; don't want to put extra work in there. This heuristic can use some +; tuning. +define void @f_8(i32 %a, i1 %cond_1, i1 %cond_2) { +; CHECK-LABEL: @f_8( +entry: + br label %loop + +loop: + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + br i1 undef, label %loop, label %leave + +leave: +; CHECK: leave: +; CHECK: %cond_3 = icmp ult i32 %a, 7 +; CHECK: %wide.chk = and i1 %cond_2, %cond_3 +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ] +; CHECK: br i1 undef, label %loop2, label %leave2 + + call void(i1, ...) @llvm.experimental.guard(i1 %cond_2) [ "deopt"() ] + br i1 undef, label %loop2, label %leave2 + +loop2: + %cond_3 = icmp ult i32 %a, 7 + call void(i1, ...) @llvm.experimental.guard(i1 %cond_3) [ "deopt"() ] + br label %loop2 + +leave2: + ret void +} + +; In cases like these where there isn't any "obviously profitable" +; widening sites, we refuse to do anything. +define void @f_9(i32 %a, i1 %cond_0, i1 %cond_1) { +; CHECK-LABEL: @f_9( +entry: + br label %first_loop + +first_loop: +; CHECK: first_loop: +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] +; CHECK: br i1 undef, label %first_loop, label %second_loop + + call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] + br i1 undef, label %first_loop, label %second_loop + +second_loop: +; CHECK: second_loop: +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] +; CHECK: br label %second_loop + + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + br label %second_loop +} + +; Same situation as in @f_9: no "obviously profitable" widening sites, +; so we refuse to do anything. +define void @f_10(i32 %a, i1 %cond_0, i1 %cond_1) { +; CHECK-LABEL: @f_10( +entry: + br label %loop + +loop: +; CHECK: loop: +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] +; CHECK: br i1 undef, label %loop, label %no_loop + + call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] + br i1 undef, label %loop, label %no_loop + +no_loop: +; CHECK: no_loop: +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] +; CHECK: ret void + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + ret void +} + +; With guards in loops, we're okay hoisting out the guard into the +; containing loop. +define void @f_11(i32 %a, i1 %cond_0, i1 %cond_1) { +; CHECK-LABEL: @f_11( +entry: + br label %inner + +inner: +; CHECK: inner: +; CHECK: %wide.chk = and i1 %cond_0, %cond_1 +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ] +; CHECK: br i1 undef, label %inner, label %outer + + call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ] + br i1 undef, label %inner, label %outer + +outer: + call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ] + br label %inner +} + +; Checks that we are adequately guarded against exponential-time +; behavior when hoisting code. +define void @f_12(i32 %a0) { +; CHECK-LABEL: @f_12 + +; Eliding the earlier 29 multiplications for brevity +; CHECK: %a30 = mul i32 %a29, %a29 +; CHECK-NEXT: %cond = trunc i32 %a30 to i1 +; CHECK-NEXT: %wide.chk = and i1 true, %cond +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ] +; CHECK-NEXT: ret void + +entry: + call void(i1, ...) @llvm.experimental.guard(i1 true) [ "deopt"() ] + %a1 = mul i32 %a0, %a0 + %a2 = mul i32 %a1, %a1 + %a3 = mul i32 %a2, %a2 + %a4 = mul i32 %a3, %a3 + %a5 = mul i32 %a4, %a4 + %a6 = mul i32 %a5, %a5 + %a7 = mul i32 %a6, %a6 + %a8 = mul i32 %a7, %a7 + %a9 = mul i32 %a8, %a8 + %a10 = mul i32 %a9, %a9 + %a11 = mul i32 %a10, %a10 + %a12 = mul i32 %a11, %a11 + %a13 = mul i32 %a12, %a12 + %a14 = mul i32 %a13, %a13 + %a15 = mul i32 %a14, %a14 + %a16 = mul i32 %a15, %a15 + %a17 = mul i32 %a16, %a16 + %a18 = mul i32 %a17, %a17 + %a19 = mul i32 %a18, %a18 + %a20 = mul i32 %a19, %a19 + %a21 = mul i32 %a20, %a20 + %a22 = mul i32 %a21, %a21 + %a23 = mul i32 %a22, %a22 + %a24 = mul i32 %a23, %a23 + %a25 = mul i32 %a24, %a24 + %a26 = mul i32 %a25, %a25 + %a27 = mul i32 %a26, %a26 + %a28 = mul i32 %a27, %a27 + %a29 = mul i32 %a28, %a28 + %a30 = mul i32 %a29, %a29 + %cond = trunc i32 %a30 to i1 + call void(i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] + ret void +}