Index: llvm/include/llvm/Transforms/Scalar/SpeculateAroundPHIs.h =================================================================== --- /dev/null +++ llvm/include/llvm/Transforms/Scalar/SpeculateAroundPHIs.h @@ -0,0 +1,111 @@ +//===- SpeculateAroundPHIs.h - Speculate around PHIs ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_SPECULATEAROUNDPHIS_H +#define LLVM_TRANSFORMS_SCALAR_SPECULATEAROUNDPHIS_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/Compiler.h" +#include + +namespace llvm { + +/// This pass handles simple speculating of instructions around PHIs when +/// doing so is profitable for a particular target despite duplicated +/// instructions. +/// +/// The motivating example are PHIs of constants which will require +/// materializing the constants along each edge. If the PHI is used by an +/// instruction where the target can materialize the constant as part of the +/// instruction, it is profitable to speculate those instructions around the +/// PHI node. This can reduce dynamic instruction count as well as decrease +/// register pressure. +/// +/// Consider this IR for example: +/// ``` +/// entry: +/// br i1 %flag, label %a, label %b +/// +/// a: +/// br label %exit +/// +/// b: +/// br label %exit +/// +/// exit: +/// %p = phi i32 [ 7, %a ], [ 11, %b ] +/// %sum = add i32 %arg, %p +/// ret i32 %sum +/// ``` +/// To materialize the inputs to this PHI node may require an explicit +/// instruction. For example, on x86 this would turn into something like +/// ``` +/// testq %eax, %eax +/// movl $7, %rNN +/// jne .L +/// movl $11, %rNN +/// .L: +/// addl %edi, %rNN +/// movl %rNN, %eax +/// retq +/// ``` +/// When these constants can be folded directly into another instruction, it +/// would be preferable to avoid the potential for register pressure (above we +/// can easily avoid it, but that isn't always true) and simply duplicate the +/// instruction using the PHI: +/// ``` +/// entry: +/// br i1 %flag, label %a, label %b +/// +/// a: +/// %sum.1 = add i32 %arg, 7 +/// br label %exit +/// +/// b: +/// %sum.2 = add i32 %arg, 11 +/// br label %exit +/// +/// exit: +/// %p = phi i32 [ %sum.1, %a ], [ %sum.2, %b ] +/// ret i32 %p +/// ``` +/// Which will generate something like the following on x86: +/// ``` +/// testq %eax, %eax +/// addl $7, %edi +/// jne .L +/// addl $11, %edi +/// .L: +/// movl %edi, %eax +/// retq +/// ``` +/// +/// It is important to note that this pass is never intended to handle more +/// complex cases where speculating around PHIs allows simplifications of the +/// IR itself or other subsequent optimizations. Those can and should already +/// be handled before this pass is ever run by a more powerful analysis that +/// can reason about equivalences and common subexpressions. Classically, those +/// cases would be handled by a GVN-powered PRE or similar transform. This +/// pass, in contrast, is *only* interested in cases where despite no +/// simplifications to the IR itself, speculation is *faster* to execute. The +/// result of this is that the cost models which are appropriate to consider +/// here are relatively simple ones around execution and codesize cost, without +/// any need to consider simplifications or other transformations. +struct SpeculateAroundPHIsPass : PassInfoMixin { + /// \brief Run the pass over the function. + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_SPECULATEAROUNDPHIS_H Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -129,6 +129,7 @@ #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/Transforms/Scalar/Sink.h" +#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h" #include "llvm/Transforms/Scalar/SpeculativeExecution.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" @@ -777,6 +778,11 @@ // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. OptimizePM.addPass(SimplifyCFGPass()); + // Optimize PHIs by speculating around them when profitable. Note that this + // pass needs to be run after any PRE or similar pass as it is essentially + // inserting redudnancies into the progrem. This even includes SimplifyCFG. + OptimizePM.addPass(SpeculateAroundPHIsPass()); + // Add the core optimizing pipeline. MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM))); Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -194,6 +194,7 @@ FUNCTION_PASS("sink", SinkingPass()) FUNCTION_PASS("slp-vectorizer", SLPVectorizerPass()) FUNCTION_PASS("speculative-execution", SpeculativeExecutionPass()) +FUNCTION_PASS("spec-phis", SpeculateAroundPHIsPass()) FUNCTION_PASS("sroa", SROA()) FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) Index: llvm/lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/Scalar/CMakeLists.txt +++ llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -61,6 +61,7 @@ SimplifyCFGPass.cpp Sink.cpp SpeculativeExecution.cpp + SpeculateAroundPHIs.cpp StraightLineStrengthReduce.cpp StructurizeCFG.cpp TailRecursionElimination.cpp Index: llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp =================================================================== --- /dev/null +++ llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp @@ -0,0 +1,798 @@ +//===- SpeculateAroundPHIs.cpp --------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "spec-phis" + +STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around"); +STATISTIC(NumEdgesSplit, + "Number of critical edges which were split for speculation"); +STATISTIC(NumSpeculatedInstructions, + "Number of instructions we speculated around the PHI nodes"); +STATISTIC(NumNewRedundantInstructions, + "Number of new, redundant instructions inserted"); + +/// Check wether speculating the users of a PHI node around the PHI +/// will be safe. +/// +/// This checks both that all of the users are safe and also that all of their +/// operands are either recursively safe or already available along an incoming +/// edge to the PHI. +/// +/// This routine caches both all the safe nodes explored in `PotentialSpecSet` +/// and the chain of nodes that definitively reach any unsafe node in +/// `UnsafeSet`. By preserving these between repeated calls to this routine for +/// PHIs in the same basic block, the exploration here can be reused. However, +/// these caches must no be reused for PHIs in a different basic block as they +/// reflect what is available along incoming edges. +static bool +isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT, + SmallPtrSetImpl &PotentialSpecSet, + SmallPtrSetImpl &UnsafeSet) { + auto *PhiBB = PN.getParent(); + SmallPtrSet Visited; + SmallVector, 16> DFSStack; + + // Walk each user of the PHI node. + for (auto &U : PN.uses()) { + auto *UI = cast(U.getUser()); + + // FIXME: This check is much too conservative. We're not going to move these + // instructions onto new dynamic paths through the program unless there is + // a call instruction between the use and the PHI node. And memory isn't + // changing unless there is a store in that same sequence. We should + // probably change this to do at least a limited scan of the intervening + // instructions and allow handling stores in easily proven safe cases. + if (mayBeMemoryDependent(*UI)) { + DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n"); + return false; + } + + // Now do a depth-first search of everything these users depend on to make + // sure they are transitively safe. This is a depth-first search, but we + // check nodes in preorder to minimize the amount of checking. + Visited.insert(UI); + DFSStack.push_back({UI, UI->value_op_begin()}); + do { + User::value_op_iterator OpIt; + std::tie(UI, OpIt) = DFSStack.pop_back_val(); + + while (OpIt != UI->value_op_end()) { + auto *OpI = dyn_cast(*OpIt); + // Increment to the next operand for whenever we continue. + ++OpIt; + // No need to visit non-instructions, which can't form dependencies. + if (!OpI) + continue; + + // Now do the main pre-order checks that this operand is a viable + // dependency of something we want to speculate. + + // First do a few checks for instructions that won't require + // speculation at all because they are trivially available on the + // incoming edge (either through dominance or through an incoming value + // to a PHI). + // + // The cases in the current block will be trivially dominated by the + // edge. + auto *ParentBB = OpI->getParent(); + if (ParentBB == PhiBB) { + if (isa(OpI)) { + // We can trivially map through phi nodes in the same block. + continue; + } + } else if (DT.dominates(ParentBB, PhiBB)) { + // Instructions from dominating blocks are already available. + continue; + } + + // Once we know that we're considering speculating the operand, check + // if we've already explored this subgraph and found it to be safe. + if (PotentialSpecSet.count(OpI)) + continue; + + // If we've already explored this subgraph and found it unsafe, bail. + // If when we directly test whether this is safe it fails, bail. + if (UnsafeSet.count(OpI) || mayBeMemoryDependent(*OpI)) { + DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI + << "\n"); + // Record the stack of instructions which reach this node as unsafe + // so we prune subsequent searches. + UnsafeSet.insert(OpI); + for (auto &StackPair : DFSStack) { + Instruction *I = StackPair.first; + UnsafeSet.insert(I); + } + return false; + } + + // Skip any operands we're already recursively checking. + if (!Visited.insert(OpI).second) + continue; + + // Push onto the stack and descend. We can directly continue this + // loop when ascending. + DFSStack.push_back({UI, OpIt}); + UI = OpI; + OpIt = OpI->value_op_begin(); + } + + // This node and all its operands are safe. Go ahead and cache that for + // reuse later. + PotentialSpecSet.insert(UI); + + // Continue with the next node on the stack. + } while (!DFSStack.empty()); + } + + // Every visited operand should have been marked as safe for speculation at + // this point. Verify this and return success. + for (auto *I : Visited) + assert(PotentialSpecSet.count(I) && + "Failed to mark a visited instruction as safe!"); + return true; +} + +/// Check whether, in isolation, a given PHI node is both safe and profitable +/// to speculate users around. +/// +/// This handles checking whether there are any constant operands to a PHI +/// which could represent a useful speculation candidate, whether the users of +/// the PHI are safe to speculate including all their transitive dependencies, +/// and whether after speculation there will be some cost savings (profit) to +/// folding the operands into the users of the PHI node. Returns true if both +/// safe and profitable with relevant cost savings updated in the map and with +/// an update to the `PotentialSpecSet`. Returns false if either safety or +/// profitability are absent. Some new entries may be made to the +/// `PotentialSpecSet` even when this routine returns false, but they remain +/// conservatively correct. +/// +/// The profitability check here is a local one, but it checks this in an +/// interesting way. Beyond checking that the total cost of materializing the +/// constants will be less than the cost of folding them into their users, it +/// also checks that no one incoming constant will have a higher cost when +/// folded into its users rather than materialized. This higher cost could +/// result in a dynamic *path* that is more expensive even when the total cost +/// is lower. Currently, all of the interesting cases where this optimization +/// should fire are ones where it is a no-loss operation in this sense. If we +/// ever want to be more aggressive here, we would need to balance the +/// different incoming edges' cost by looking at their respective +/// probabilities. +static bool isSafeAndProfitableToSpeculateAroundPHI( + PHINode &PN, SmallDenseMap &CostSavingsMap, + SmallPtrSetImpl &PotentialSpecSet, + SmallPtrSetImpl &UnsafeSet, DominatorTree &DT, + TargetTransformInfo &TTI) { + // First see whether there is any cost savings to speculating around this + // PHI, and build up a map of the constant inputs to how many times they + // occur. + bool NonFreeMat = false; + struct CostsAndCount { + int MatCost = TargetTransformInfo::TCC_Free; + int FoldedCost = TargetTransformInfo::TCC_Free; + int Count = 0; + }; + SmallDenseMap CostsAndCounts; + SmallPtrSet IncomingConstantBlocks; + for (int i : llvm::seq(0, PN.getNumIncomingValues())) { + auto *IncomingC = dyn_cast(PN.getIncomingValue(i)); + if (!IncomingC) + continue; + + // Only visit each incoming edge with a constant input once. + if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second) + continue; + + auto InsertResult = CostsAndCounts.insert({IncomingC, {}}); + // Count how many edges share a given incoming costant. + ++InsertResult.first->second.Count; + // Only compute the cost the first time we see a particular constant. + if (!InsertResult.second) + continue; + + int &MatCost = InsertResult.first->second.MatCost; + MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType()); + NonFreeMat |= MatCost != TTI.TCC_Free; + } + if (!NonFreeMat) { + DEBUG(dbgs() << " Free: " << PN << "\n"); + // No profit in free materialization. + return false; + } + + // Now check that the uses of this PHI can actually be speculated, + // otherwise we'll still have to materialize the PHI value. + if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) { + DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n"); + return false; + } + + // Compute how much (if any) savings are available by speculating around this + // PHI. + for (Use &U : PN.uses()) { + auto *UserI = cast(U.getUser()); + // Now check whether there is any savings to folding the incoming constants + // into this use. + unsigned Idx = U.getOperandNo(); + + // If we have a binary operator that is commutative, an actual constant + // operand would end up on the RHS, so pretend the use of the PHI is on the + // RHS. + // + // Technically, this is a bit weird if *both* operands are PHIs we're + // speculating. But if that is the case, giving an "optimistic" cost isn't + // a bad thing because after speculation it will constant fold. And + // moreover, such cases should likely have been constant folded already by + // some other pass, so we shouldn't worry about "modeling" them terribly + // accurately here. Similarly, if the other operand is a constant, it still + // seems fine to be "optimistic" in our cost modeling, because when the + // incoming operand from the PHI node is also a constant, we will end up + // constant folding. + if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1) + // Assume we will commute the constant to the RHS to be canonical. + Idx = 1; + + // Get the intrinsic ID if this user is an instrinsic. + Intrinsic::ID IID = Intrinsic::not_intrinsic; + if (auto *UserII = dyn_cast(UserI)) + IID = UserII->getIntrinsicID(); + + for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) { + ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first; + int MatCost = IncomingConstantAndCostsAndCount.second.MatCost; + int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost; + if (IID) + FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(), + IncomingC->getType()); + else + FoldedCost += + TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(), + IncomingC->getType()); + + // If we accumulate more folded cost for this incoming constant than + // materialized cost, then we'll regress any edge with this constant so + // just bail. We're only interested in cases where folding the incoming + // constants is at least break-even on all paths. + if (FoldedCost > MatCost) { + DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC << "\n" + " Materializing cost: " << MatCost << "\n" + " Accumulated folded cost: " << FoldedCost << "\n"); + return false; + } + } + } + + // Compute the total cost savings afforded by this PHI node. + int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free; + for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) { + int MatCost = IncomingConstantAndCostsAndCount.second.MatCost; + int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost; + int Count = IncomingConstantAndCostsAndCount.second.Count; + + TotalMatCost += MatCost * Count; + TotalFoldedCost += FoldedCost * Count; + } + assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is " + "less that its materialized cost, " + "the sum must be as well."); + + DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost) + << ": " << PN << "\n"); + CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost; + return true; +} + +/// Simple helper to walk all the users of a list of phis depth first, and call +/// a visit function on each one in post-order. +/// +/// All of the PHIs should be in the same basic block, and this is primarily +/// used to make a single depth-first walk across their collective users +/// without revisiting any subgraphs. Callers should provide a fast, idempotent +/// callable to test whether a node has been visited and the more important +/// callable to actually visit a particular node. +/// +/// Depth-first and postorder here refer to the *operand* graph -- we start +/// from a collection of users of PHI nodes and walk "up" the operands +/// depth-first. +template +static void visitPHIUsersAndDepsInPostOrder(ArrayRef PNs, + IsVisitedT IsVisited, + VisitT Visit) { + SmallVector, 16> DFSStack; + for (auto *PN : PNs) + for (auto &U : PN->uses()) { + auto *UI = cast(U.getUser()); + if (IsVisited(UI)) + // Already visited this user, continue across the roots. + continue; + + // Otherwise, walk the operand graph depth-first and visit each + // dependency in postorder. + DFSStack.push_back({UI, UI->value_op_begin()}); + do { + User::value_op_iterator OpIt; + std::tie(UI, OpIt) = DFSStack.pop_back_val(); + while (OpIt != UI->value_op_end()) { + auto *OpI = dyn_cast(*OpIt); + // Increment to the next operand for whenever we continue. + ++OpIt; + // No need to visit non-instructions, which can't form dependencies, + // or instructions outside of our potential dependency set that we + // were given. Finally, if we've already visited the node, continue + // to the next. + if (!OpI || IsVisited(OpI)) + continue; + + // Push onto the stack and descend. We can directly continue this + // loop when ascending. + DFSStack.push_back({UI, OpIt}); + UI = OpI; + OpIt = OpI->value_op_begin(); + } + + // Finished visiting children, visit this node. + assert(!IsVisited(UI) && "Should not have already visited a node!"); + Visit(UI); + } while (!DFSStack.empty()); + } +} + +/// Find profitable PHIs to speculate. +/// +/// For a PHI node to be profitable, we need the cost of speculating its users +/// (and their dependencies) to not exceed the savings of folding the PHI's +/// constant operands into the speculated users. +/// +/// Computing this is surprisingly challenging. Because users of two different +/// PHI nodes can depend on each other or on common other instructions, it may +/// be profitable to speculate two PHI nodes together even though neither one +/// in isolation is profitable. The straightforward way to find all the +/// profitable PHIs would be to check each combination of PHIs' cost, but this +/// is exponential in complexity. +/// +/// Even if we assume that we only care about cases where we can consider each +/// PHI node in isolation (rather than considering cases where none are +/// profitable in isolation but some subset are profitable as a set), we still +/// have a challenge. The obvious way to find all individually profitable PHIs +/// is to iterate until reaching a fixed point, but this will be quadratic in +/// complexity. =/ +/// +/// This code currently uses a linear-to-compute order for a greedy approach. +/// It won't find cases where a set of PHIs must be considered together, but it +/// handles most cases of order dependence without quadratic iteration. The +/// specific order used is the post-order across the operand DAG. When the last +/// user of a PHI is visited in this postorder walk, we check it for +/// profitability. +/// +/// There is an orthogonal extra complexity to all of this: computing the cost +/// itself can easily become a linear computation making everything again (at +/// best) quadratic. Using a postorder over the operand graph makes it +/// particularly easy to avoid this through dynamic programming. As we do the +/// postorder walk, we build the transitive cost of that subgraph. It is also +/// straightforward to then update these costs when we mark a PHI for +/// speculation so that subsequent PHIs don't re-pay the cost of already +/// speculated instructions. +static SmallVector +findProfitablePHIs(ArrayRef PNs, + const SmallDenseMap &CostSavingsMap, + const SmallPtrSetImpl &PotentialSpecSet, + int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) { + SmallVector SpecPNs; + + // First, establish a reverse mapping from immediate users of the PHI nodes + // to the nodes themselves, and count how many users each PHI node has in + // a way we can update while processing them. + SmallDenseMap, 16> UserToPNMap; + SmallDenseMap PNUserCountMap; + SmallPtrSet UserSet; + for (auto *PN : PNs) { + assert(UserSet.empty() && "Must start with an empty user set!"); + for (Use &U : PN->uses()) + UserSet.insert(cast(U.getUser())); + PNUserCountMap[PN] = UserSet.size(); + for (auto *UI : UserSet) + UserToPNMap.insert({UI, {}}).first->second.push_back(PN); + UserSet.clear(); + } + + // Now do a DFS across the operand graph of the users, computing cost as we + // go and when all costs for a given PHI are known, checking that PHI for + // profitability. + SmallDenseMap SpecCostMap; + visitPHIUsersAndDepsInPostOrder( + PNs, + /*IsVisited*/ + [&](Instruction *I) { + // We consider anything that isn't potentially speculated to be + // "visited" as it is already handled. Similarly, anything that *is* + // potentially speculated but for which we have an entry in our cost + // map, we're done. + return !PotentialSpecSet.count(I) || SpecCostMap.count(I); + }, + /*Visit*/ + [&](Instruction *I) { + // We've fully visited the operands, so sum their cost with this node + // and update the cost map. + int Cost = TTI.TCC_Free; + for (Value *OpV : I->operand_values()) + if (auto *OpI = dyn_cast(OpV)) { + auto CostMapIt = SpecCostMap.find(OpI); + if (CostMapIt != SpecCostMap.end()) + Cost += CostMapIt->second; + } + Cost += TTI.getUserCost(I); + bool Inserted = SpecCostMap.insert({I, Cost}).second; + (void)Inserted; + assert(Inserted && "Must not re-insert a cost during the DFS!"); + + // Now check if this node had a corresponding PHI node using it. If so, + // we need to decrement the outstanding user count for it. + auto UserPNsIt = UserToPNMap.find(I); + if (UserPNsIt == UserToPNMap.end()) + return; + auto &UserPNs = UserPNsIt->second; + auto UserPNsSplitIt = std::stable_partition( + UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) { + int &PNUserCount = PNUserCountMap.find(UserPN)->second; + assert( + PNUserCount > 0 && + "Should never re-visit a PN after its user count hits zero!"); + --PNUserCount; + return PNUserCount != 0; + }); + + // FIXME: Rather than one at a time, we should sum the savings as the + // cost will be completely shared. + SmallVector SpecWorklist; + for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) { + int SpecCost = TTI.TCC_Free; + for (Use &U : PN->uses()) + SpecCost += + SpecCostMap.find(cast(U.getUser()))->second; + SpecCost *= (NumPreds - 1); + // When the user count of a PHI node hits zero, we should check its + // profitability. If profitable, we should mark it for speculation + // and zero out the cost of everything it depends on. + int CostSavings = CostSavingsMap.find(PN)->second; + if (SpecCost > CostSavings) { + DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN << "\n" + " Cost savings: " << CostSavings << "\n" + " Speculation cost: " << SpecCost << "\n"); + continue; + } + + // We're going to speculate this user-associated PHI. Copy it out and + // add its users to the worklist to update their cost. + SpecPNs.push_back(PN); + for (Use &U : PN->uses()) { + auto *UI = cast(U.getUser()); + auto CostMapIt = SpecCostMap.find(UI); + if (CostMapIt->second == 0) + continue; + // Zero out this cost entry to avoid duplicates. + CostMapIt->second = 0; + SpecWorklist.push_back(UI); + } + } + + // Now walk all the operands of the users in the worklist transitively + // to zero out all the memoized costs. + while (!SpecWorklist.empty()) { + Instruction *SpecI = SpecWorklist.pop_back_val(); + assert(SpecCostMap.find(SpecI)->second == 0 && + "Didn't zero out a cost!"); + + // Walk the operands recursively to zero out their cost as well. + for (auto *OpV : SpecI->operand_values()) { + auto *OpI = dyn_cast(OpV); + if (!OpI) + continue; + auto CostMapIt = SpecCostMap.find(OpI); + if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0) + continue; + CostMapIt->second = 0; + SpecWorklist.push_back(OpI); + } + } + }); + + return SpecPNs; +} + +/// Speculate users around a set of PHI nodes. +/// +/// This routine does the actual speculation around a set of PHI nodes where we +/// have determined this to be both safe and profitable. +/// +/// This routine handles any spliting of critical edges necessary to create +/// a safe block to speculate into as well as cloning the instructions and +/// rewriting all uses. +static void speculatePHIs(ArrayRef SpecPNs, + SmallPtrSetImpl &PotentialSpecSet, + SmallSetVector &PredSet, + DominatorTree &DT) { + DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n"); + NumPHIsSpeculated += SpecPNs.size(); + + // Split any critical edges so that we have a block to hoist into. + auto *ParentBB = SpecPNs[0]->getParent(); + SmallVector SpecPreds; + SpecPreds.reserve(PredSet.size()); + for (auto *PredBB : PredSet) { + auto *NewPredBB = SplitCriticalEdge( + PredBB, ParentBB, + CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges()); + if (NewPredBB) { + ++NumEdgesSplit; + DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName() + << "\n"); + SpecPreds.push_back(NewPredBB); + } else { + assert(PredBB->getSingleSuccessor() == ParentBB && + "We need a non-critical predecessor to speculate into."); + assert(!isa(PredBB->getTerminator()) && + "Cannot have a non-critical invoke!"); + + // Already non-critical, use existing pred. + SpecPreds.push_back(PredBB); + } + } + + SmallPtrSet SpecSet; + SmallVector SpecList; + visitPHIUsersAndDepsInPostOrder(SpecPNs, + /*IsVisited*/ + [&](Instruction *I) { + // This is visited if we don't need to + // speculate it or we already have + // speculated it. + return !PotentialSpecSet.count(I) || + SpecSet.count(I); + }, + /*Visit*/ + [&](Instruction *I) { + // All operands scheduled, schedule this + // node. + SpecSet.insert(I); + SpecList.push_back(I); + }); + + int NumSpecInsts = SpecList.size() * SpecPreds.size(); + int NumRedundantInsts = NumSpecInsts - SpecList.size(); + DEBUG(dbgs() << " Inserting " << NumSpecInsts << " speculated instructions, " + << NumRedundantInsts << " redundancies\n"); + NumSpeculatedInstructions += NumSpecInsts; + NumNewRedundantInstructions += NumRedundantInsts; + + // Each predecessor is numbered by its index in `SpecPreds`, so for each + // instruction we speculate, the speculated instruction is stored in that + // index of the vector asosciated with the original instruction. We also + // store the incoming values for each predecessor from any PHIs used. + SmallDenseMap, 16> SpeculatedValueMap; + + // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming + // value. This handles both the PHIs we are speculating around and any other + // PHIs that happen to be used. + for (auto *OrigI : SpecList) + for (auto *OpV : OrigI->operand_values()) { + auto *OpPN = dyn_cast(OpV); + if (!OpPN || OpPN->getParent() != ParentBB) + continue; + + auto InsertResult = SpeculatedValueMap.insert({OpPN, {}}); + if (!InsertResult.second) + continue; + + auto &SpeculatedVals = InsertResult.first->second; + + // Populating our structure for mapping is particularly annoying because + // finding an incoming value for a particular predecessor block in a PHI + // node is a linear time operation! To avoid quadratic behavior, we build + // a map for this PHI node's incoming values and then translate it into + // the more compact representation used below. + SmallDenseMap IncomingValueMap; + for (int i : llvm::seq(0, OpPN->getNumIncomingValues())) + IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i); + + for (auto *PredBB : SpecPreds) + SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second); + } + + // Speculate into each predecessor. + for (int PredIdx : llvm::seq(0, SpecPreds.size())) { + auto *PredBB = SpecPreds[PredIdx]; + assert(PredBB->getSingleSuccessor() == ParentBB && + "We need a non-critical predecessor to speculate into."); + + for (auto *OrigI : SpecList) { + auto *NewI = OrigI->clone(); + NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx)); + NewI->insertBefore(PredBB->getTerminator()); + + // Rewrite all the operands to the previously speculated instructions. + // Because we're walking in-order, the defs must precede the uses and we + // should already have these mappings. + for (Use &U : NewI->operands()) { + auto *OpI = dyn_cast(U.get()); + if (!OpI) + continue; + auto MapIt = SpeculatedValueMap.find(OpI); + if (MapIt == SpeculatedValueMap.end()) + continue; + const auto &SpeculatedVals = MapIt->second; + assert(SpeculatedVals[PredIdx] && + "Must have a speculated value for this predecessor!"); + assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() && + "Speculated value has the wrong type!"); + + // Rewrite the use to this predecessor's speculated instruction. + U.set(SpeculatedVals[PredIdx]); + } + + // Commute instructions which now have a constant in the LHS but not the + // RHS. + if (NewI->isBinaryOp() && NewI->isCommutative() && + isa(NewI->getOperand(0)) && + !isa(NewI->getOperand(1))) + NewI->getOperandUse(0).swap(NewI->getOperandUse(1)); + + SpeculatedValueMap[OrigI].push_back(NewI); + assert(SpeculatedValueMap[OrigI][PredIdx] == NewI && + "Mismatched speculated instruction index!"); + } + } + + // Walk the speculated instruction list and if they have uses, insert a PHI + // for them from the speculated versions, and replace the uses with the PHI. + // Then erase the instructions as they have been fully speculated. The walk + // needs to be in reverse so that we don't think there are users when we'll + // actually eventually remove them later. + IRBuilder<> IRB(SpecPNs[0]); + for (auto *OrigI : llvm::reverse(SpecList)) { + // Check if we need a PHI for any remaining users and if so, insert it. + if (!OrigI->use_empty()) { + auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(), + Twine(OrigI->getName()) + ".phi"); + // Add the incoming values we speculated. + auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second; + for (int PredIdx : llvm::seq(0, SpecPreds.size())) + SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]); + + // And replace the uses with the PHI node. + OrigI->replaceAllUsesWith(SpecIPN); + } + + // It is important to immediately erase this so that it stops using other + // instructions. This avoids inserting needless PHIs of them. + OrigI->eraseFromParent(); + } + + // All of the uses of the speculated phi nodes should be removed at this + // point, so erase them. + for (auto *SpecPN : SpecPNs) { + assert(SpecPN->use_empty() && "All users should have been speculated!"); + SpecPN->eraseFromParent(); + } +} + +/// Try to speculate around a series of PHIs from a single basic block. +/// +/// This routine checks whether any of these PHIs are profitable to speculate +/// users around. If safe and profitable, it does the speculation. It returns +/// true when at least some speculation occurs. +static bool tryToSpeculatePHIs(SmallVectorImpl &PNs, + DominatorTree &DT, TargetTransformInfo &TTI) { + DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n"); + + // Savings in cost from speculating around a PHI node. + SmallDenseMap CostSavingsMap; + + // Remember the set of instructions that are candidates for speculation so + // that we can quickly walk things within that space. This prunes out + // instructions already available along edges, etc. + SmallPtrSet PotentialSpecSet; + + // Remember the set of instructions that are (transitively) unsafe to + // speculate into the incoming edges of this basic block. This avoids + // recomputing them for each PHI node we check. This set is specific to this + // block though as things are pruned out of it based on what is available + // along incoming edges. + SmallPtrSet UnsafeSet; + + // For each PHI node in this block, check whether there are immediate folding + // opportunities from speculation, and whether that speculation will be + // valid. This determise the set of safe PHIs to speculate. + PNs.erase(llvm::remove_if(PNs, + [&](PHINode *PN) { + return !isSafeAndProfitableToSpeculateAroundPHI( + *PN, CostSavingsMap, PotentialSpecSet, + UnsafeSet, DT, TTI); + }), + PNs.end()); + // If no PHIs were profitable, skip. + if (PNs.empty()) { + DEBUG(dbgs() << " No safe and profitable PHIs found!\n"); + return false; + } + + // We need to know how much speculation will cost which is determined by how + // many incoming edges will need a copy of each speculated instruction. + SmallSetVector PredSet; + for (auto *PredBB : PNs[0]->blocks()) { + if (!PredSet.insert(PredBB)) + continue; + + // We cannot speculate when a predecessor is an indirect branch. + // FIXME: We also can't reliably create a non-critical edge block for + // speculation if the predecessor is an invoke. This doesn't seem + // fundamental and we should probably be splitting critical edges + // differently. + if (isa(PredBB->getTerminator()) || + isa(PredBB->getTerminator())) { + DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName() + << "\n"); + return false; + } + } + if (PredSet.size() < 2) { + DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n"); + return false; + } + + SmallVector SpecPNs = findProfitablePHIs( + PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI); + if (SpecPNs.empty()) + // Nothing to do. + return false; + + speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT); + return true; +} + +PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &DT = AM.getResult(F); + auto &TTI = AM.getResult(F); + + bool Changed = false; + for (auto *BB : ReversePostOrderTraversal(&F)) { + SmallVector PNs; + auto BBI = BB->begin(); + while (auto *PN = dyn_cast(&*BBI)) { + PNs.push_back(PN); + ++BBI; + } + + if (PNs.empty()) + continue; + + Changed |= tryToSpeculatePHIs(PNs, DT, TTI); + } + + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + return PA; +} Index: llvm/test/Other/new-pm-defaults.ll =================================================================== --- llvm/test/Other/new-pm-defaults.ll +++ llvm/test/Other/new-pm-defaults.ll @@ -208,6 +208,7 @@ ; CHECK-O-NEXT: Running pass: InstSimplifierPass ; CHECK-O-NEXT: Running pass: DivRemPairsPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-O-NEXT: Running pass: SpeculateAroundPHIsPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: GlobalDCEPass ; CHECK-O-NEXT: Running pass: ConstantMergePass Index: llvm/test/Other/new-pm-thinlto-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-defaults.ll +++ llvm/test/Other/new-pm-thinlto-defaults.ll @@ -196,6 +196,7 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifierPass ; CHECK-POSTLINK-O-NEXT: Running pass: DivRemPairsPass ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-POSTLINK-O-NEXT: Running pass: SpeculateAroundPHIsPass ; CHECK-POSTLINK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-POSTLINK-O-NEXT: Running pass: GlobalDCEPass ; CHECK-POSTLINK-O-NEXT: Running pass: ConstantMergePass Index: llvm/test/Transforms/SpeculateAroundPHIs/X86/basic.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SpeculateAroundPHIs/X86/basic.ll @@ -0,0 +1,497 @@ +; Test the basic functionality of speculating around PHI nodes based on reduced +; cost of the constant operands to the PHI nodes. +; +; RUN: opt -S -passes=spec-phis < %s | FileCheck %s + +target triple = "x86_64-unknown-unknown" + +define i32 @test_basic(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_basic( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg, 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %arg, 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %arg, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +; Check that we handle commuted operands and get the constant onto the RHS. +define i32 @test_commuted(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_commuted( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg, 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %arg, 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %p, %arg + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +define i32 @test_no_spec_dominating_inst(i1 %flag, i32* %ptr) { +; CHECK-LABEL: define i32 @test_no_spec_dominating_inst( +entry: + %load = load i32, i32* %ptr + br i1 %flag, label %a, label %b +; CHECK: %[[LOAD:.*]] = load i32, i32* %ptr +; CHECK-NEXT: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %[[LOAD]], 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %[[LOAD]], 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %load, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +; We have special logic handling PHI nodes, make sure it doesn't get confused +; by a dominating PHI. +define i32 @test_no_spec_dominating_phi(i1 %flag1, i1 %flag2, i32 %x, i32 %y) { +; CHECK-LABEL: define i32 @test_no_spec_dominating_phi( +entry: + br i1 %flag1, label %x.block, label %y.block +; CHECK: entry: +; CHECK-NEXT: br i1 %flag1, label %x.block, label %y.block + +x.block: + br label %merge +; CHECK: x.block: +; CHECK-NEXT: br label %merge + +y.block: + br label %merge +; CHECK: y.block: +; CHECK-NEXT: br label %merge + +merge: + %xy.phi = phi i32 [ %x, %x.block ], [ %y, %y.block ] + br i1 %flag2, label %a, label %b +; CHECK: merge: +; CHECK-NEXT: %[[XY_PHI:.*]] = phi i32 [ %x, %x.block ], [ %y, %y.block ] +; CHECK-NEXT: br i1 %flag2, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %[[XY_PHI]], 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %[[XY_PHI]], 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %xy.phi, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[SUM_PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: ret i32 %[[SUM_PHI]] +} + +; Ensure that we will speculate some number of "free" instructions on the given +; architecture even though they are unrelated to the PHI itself. +define i32 @test_speculate_free_insts(i1 %flag, i64 %arg) { +; CHECK-LABEL: define i32 @test_speculate_free_insts( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[T1_A:.*]] = trunc i64 %arg to i48 +; CHECK-NEXT: %[[T2_A:.*]] = trunc i48 %[[T1_A]] to i32 +; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %[[T2_A]], 7 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[T1_B:.*]] = trunc i64 %arg to i48 +; CHECK-NEXT: %[[T2_B:.*]] = trunc i48 %[[T1_B]] to i32 +; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %[[T2_B]], 11 +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %t1 = trunc i64 %arg to i48 + %t2 = trunc i48 %t1 to i32 + %sum = add i32 %t2, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +; We shouldn't speculate multiple uses even if each individually looks +; profitable because of the total cost. +define i32 @test_no_spec_multi_uses(i1 %flag, i32 %arg1, i32 %arg2, i32 %arg3) { +; CHECK-LABEL: define i32 @test_no_spec_multi_uses( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %add1 = add i32 %arg1, %p + %add2 = add i32 %arg2, %p + %add3 = add i32 %arg3, %p + %sum1 = add i32 %add1, %add2 + %sum2 = add i32 %sum1, %add3 + ret i32 %sum2 +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ] +; CHECK-NEXT: %[[ADD1:.*]] = add i32 %arg1, %[[PHI]] +; CHECK-NEXT: %[[ADD2:.*]] = add i32 %arg2, %[[PHI]] +; CHECK-NEXT: %[[ADD3:.*]] = add i32 %arg3, %[[PHI]] +; CHECK-NEXT: %[[SUM1:.*]] = add i32 %[[ADD1]], %[[ADD2]] +; CHECK-NEXT: %[[SUM2:.*]] = add i32 %[[SUM1]], %[[ADD3]] +; CHECK-NEXT: ret i32 %[[SUM2]] +} + +define i32 @test_multi_phis1(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_multi_phis1( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A1:.*]] = add i32 %arg, 1 +; CHECK-NEXT: %[[SUM_A2:.*]] = add i32 %[[SUM_A1]], 3 +; CHECK-NEXT: %[[SUM_A3:.*]] = add i32 %[[SUM_A2]], 5 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B1:.*]] = add i32 %arg, 2 +; CHECK-NEXT: %[[SUM_B2:.*]] = add i32 %[[SUM_B1]], 4 +; CHECK-NEXT: %[[SUM_B3:.*]] = add i32 %[[SUM_B2]], 6 +; CHECK-NEXT: br label %exit + +exit: + %p1 = phi i32 [ 1, %a ], [ 2, %b ] + %p2 = phi i32 [ 3, %a ], [ 4, %b ] + %p3 = phi i32 [ 5, %a ], [ 6, %b ] + %sum1 = add i32 %arg, %p1 + %sum2 = add i32 %sum1, %p2 + %sum3 = add i32 %sum2, %p3 + ret i32 %sum3 +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A3]], %a ], [ %[[SUM_B3]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +; Check that the order of the PHIs doesn't impact the behavior. +define i32 @test_multi_phis2(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_multi_phis2( +entry: + br i1 %flag, label %a, label %b +; CHECK: br i1 %flag, label %a, label %b + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: %[[SUM_A1:.*]] = add i32 %arg, 1 +; CHECK-NEXT: %[[SUM_A2:.*]] = add i32 %[[SUM_A1]], 3 +; CHECK-NEXT: %[[SUM_A3:.*]] = add i32 %[[SUM_A2]], 5 +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: %[[SUM_B1:.*]] = add i32 %arg, 2 +; CHECK-NEXT: %[[SUM_B2:.*]] = add i32 %[[SUM_B1]], 4 +; CHECK-NEXT: %[[SUM_B3:.*]] = add i32 %[[SUM_B2]], 6 +; CHECK-NEXT: br label %exit + +exit: + %p3 = phi i32 [ 5, %a ], [ 6, %b ] + %p2 = phi i32 [ 3, %a ], [ 4, %b ] + %p1 = phi i32 [ 1, %a ], [ 2, %b ] + %sum1 = add i32 %arg, %p1 + %sum2 = add i32 %sum1, %p2 + %sum3 = add i32 %sum2, %p3 + ret i32 %sum3 +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A3]], %a ], [ %[[SUM_B3]], %b ] +; CHECK-NEXT: ret i32 %[[PHI]] +} + +define i32 @test_no_spec_indirectbr(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_no_spec_indirectbr( +entry: + br i1 %flag, label %a, label %b +; CHECK: entry: +; CHECK-NEXT: br i1 %flag, label %a, label %b + +a: + indirectbr i8* undef, [label %exit] +; CHECK: a: +; CHECK-NEXT: indirectbr i8* undef, [label %exit] + +b: + indirectbr i8* undef, [label %exit] +; CHECK: b: +; CHECK-NEXT: indirectbr i8* undef, [label %exit] + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %arg, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ] +; CHECK-NEXT: %[[SUM:.*]] = add i32 %arg, %[[PHI]] +; CHECK-NEXT: ret i32 %[[SUM]] +} + +declare void @g() + +declare i32 @__gxx_personality_v0(...) + +; FIXME: We should be able to handle this case -- only the exceptional edge is +; impossible to split. +define i32 @test_no_spec_invoke_continue(i1 %flag, i32 %arg) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: define i32 @test_no_spec_invoke_continue( +entry: + br i1 %flag, label %a, label %b +; CHECK: entry: +; CHECK-NEXT: br i1 %flag, label %a, label %b + +a: + invoke void @g() + to label %exit unwind label %lpad +; CHECK: a: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %exit unwind label %lpad + +b: + invoke void @g() + to label %exit unwind label %lpad +; CHECK: b: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %exit unwind label %lpad + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %sum = add i32 %arg, %p + ret i32 %sum +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ] +; CHECK-NEXT: %[[SUM:.*]] = add i32 %arg, %[[PHI]] +; CHECK-NEXT: ret i32 %[[SUM]] + +lpad: + %lp = landingpad { i8*, i32 } + cleanup + resume { i8*, i32 } undef +} + +define i32 @test_no_spec_landingpad(i32 %arg, i32* %ptr) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: define i32 @test_no_spec_landingpad( +entry: + invoke void @g() + to label %invoke.cont unwind label %lpad +; CHECK: entry: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %invoke.cont unwind label %lpad + +invoke.cont: + invoke void @g() + to label %exit unwind label %lpad +; CHECK: invoke.cont: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %exit unwind label %lpad + +lpad: + %p = phi i32 [ 7, %entry ], [ 11, %invoke.cont ] + %lp = landingpad { i8*, i32 } + cleanup + %sum = add i32 %arg, %p + store i32 %sum, i32* %ptr + resume { i8*, i32 } undef +; CHECK: lpad: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %entry ], [ 11, %invoke.cont ] + +exit: + ret i32 0 +} + +declare i32 @__CxxFrameHandler3(...) + +define i32 @test_no_spec_cleanuppad(i32 %arg, i32* %ptr) personality i32 (...)* @__CxxFrameHandler3 { +; CHECK-LABEL: define i32 @test_no_spec_cleanuppad( +entry: + invoke void @g() + to label %invoke.cont unwind label %lpad +; CHECK: entry: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %invoke.cont unwind label %lpad + +invoke.cont: + invoke void @g() + to label %exit unwind label %lpad +; CHECK: invoke.cont: +; CHECK-NEXT: invoke void @g() +; CHECK-NEXT: to label %exit unwind label %lpad + +lpad: + %p = phi i32 [ 7, %entry ], [ 11, %invoke.cont ] + %cp = cleanuppad within none [] + %sum = add i32 %arg, %p + store i32 %sum, i32* %ptr + cleanupret from %cp unwind to caller +; CHECK: lpad: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %entry ], [ 11, %invoke.cont ] + +exit: + ret i32 0 +} + +; Check that we don't fall over when confronted with seemingly reasonable code +; for us to handle but in an unreachable region and with non-PHI use-def +; cycles. +define i32 @test_unreachable_non_phi_cycles(i1 %flag, i32 %arg) { +; CHECK-LABEL: define i32 @test_unreachable_non_phi_cycles( +entry: + ret i32 42 +; CHECK: entry: +; CHECK-NEXT: ret i32 42 + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: br label %exit + +exit: + %p = phi i32 [ 7, %a ], [ 11, %b ] + %zext = zext i32 %sum to i64 + %trunc = trunc i64 %zext to i32 + %sum = add i32 %trunc, %p + br i1 %flag, label %a, label %b +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ] +; CHECK-NEXT: %[[ZEXT:.*]] = zext i32 %[[SUM:.*]] to i64 +; CHECK-NEXT: %[[TRUNC:.*]] = trunc i64 %[[ZEXT]] to i32 +; CHECK-NEXT: %[[SUM]] = add i32 %[[TRUNC]], %[[PHI]] +; CHECK-NEXT: br i1 %flag, label %a, label %b +} + +; Check that we don't speculate in the face of an expensive immediate. There +; are two reasons this should never speculate. First, even a local analysis +; should fail because it makes some paths (%a) potentially more expensive due +; to multiple uses of the immediate. Additionally, when we go to speculate the +; instructions, their cost will also be too high. +; FIXME: The goal is really to test the first property, but there doesn't +; happen to be any way to use free-to-speculate instructions here so that it +; would be the only interesting property. +define i64 @test_expensive_imm(i32 %flag, i64 %arg) { +; CHECK-LABEL: define i64 @test_expensive_imm( +entry: + switch i32 %flag, label %a [ + i32 1, label %b + i32 2, label %c + i32 3, label %d + ] +; CHECK: switch i32 %flag, label %a [ +; CHECK-NEXT: i32 1, label %b +; CHECK-NEXT: i32 2, label %c +; CHECK-NEXT: i32 3, label %d +; CHECK-NEXT: ] + +a: + br label %exit +; CHECK: a: +; CHECK-NEXT: br label %exit + +b: + br label %exit +; CHECK: b: +; CHECK-NEXT: br label %exit + +c: + br label %exit +; CHECK: c: +; CHECK-NEXT: br label %exit + +d: + br label %exit +; CHECK: d: +; CHECK-NEXT: br label %exit + +exit: + %p = phi i64 [ 4294967296, %a ], [ 1, %b ], [ 1, %c ], [ 1, %d ] + %sum1 = add i64 %arg, %p + %sum2 = add i64 %sum1, %p + ret i64 %sum2 +; CHECK: exit: +; CHECK-NEXT: %[[PHI:.*]] = phi i64 [ {{[0-9]+}}, %a ], [ 1, %b ], [ 1, %c ], [ 1, %d ] +; CHECK-NEXT: %[[SUM1:.*]] = add i64 %arg, %[[PHI]] +; CHECK-NEXT: %[[SUM2:.*]] = add i64 %[[SUM1]], %[[PHI]] +; CHECK-NEXT: ret i64 %[[SUM2]] +} Index: llvm/test/Transforms/SpeculateAroundPHIs/X86/lit.local.cfg =================================================================== --- /dev/null +++ llvm/test/Transforms/SpeculateAroundPHIs/X86/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'X86' in config.root.targets: + config.unsupported = True