Index: llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp =================================================================== --- llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -14,7 +14,7 @@ // diamond (hammock) and merges them into a single store to the tail block // (footer). The algorithm iterates over the instructions of one side of the // diamond and attempts to find a matching store on the other side. New -// tail/footer block may be insterted if the tail/footer block has more +// tail/footer block may be inserted if the tail/footer block has more // predecessors (not only the two predecessors that are forming the diamond). It // sinks when it thinks it safe to do so. This optimization helps with eg. // hiding load latencies, triggering if-conversion, and reducing static code @@ -62,15 +62,53 @@ // <...> // // +// This pass also replaces conditional stores with unconditional ones (subject to +// constraints). +// Given a triangle shape before the transformation: +// +// header: +// br %cond, label %if.then, label %if.end +// + + +// + + +// + + +// if.then: + +// store %s, %addr + +// br label %if.end + +// + + +// + + +// + + +// if.end ("footer"): +// <...> +// +// After the conditional store replacement, this becomes: +// header: +// br %cond, label %if.then, label %if.else +// + + +// + + +// + + +// if.then: if.else: +// br label %if.end load %s.old, %addr +// br label %if.end +// + + +// + + +// + + +// if.end ("footer"): +// %s.new = phi [%s, if.then], [%s.old, if.else] +// <...> +// store %s.new, %addr_s +// <...> +// //===----------------------- TODO -----------------------------------------===// // // 1) Generalize to regions other than diamonds // 2) Be more aggressive merging memory operations +// 3) Add conditional store replacement for stores to non-local addresses // Note that both changes require register pressure control // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" +#include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" @@ -95,6 +133,10 @@ class MergedLoadStoreMotion { AliasAnalysis *AA = nullptr; + // The set of stores considered safe to turn from conditional to + // unconditional. + DenseSet SafeStores; + // The mergeStores algorithms could have Size0 * Size1 complexity, // where Size0 and Size1 are the number instructions on the two sides of // the diamond. The constant chosen here is arbitrary. Compiler Time @@ -104,11 +146,12 @@ const bool SplitFooterBB; public: MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {} - std::pair run(Function &F, AliasAnalysis &AA); + std::pair run(Function &F, AliasAnalysis &AA, DominatorTree &DT); private: bool isDiamondHead(BasicBlock *BB, BasicBlock *&Left, BasicBlock *&Right, BasicBlock *&Tail); + bool isTriangleHead(BasicBlock *BB, BasicBlock *&Side, BasicBlock *&Tail); // Routines for sinking stores StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); @@ -119,6 +162,9 @@ StoreInst *ElseInst); std::pair mergeStores(BasicBlock *Head, BasicBlock *Left, BasicBlock *Right, BasicBlock *Tail); + bool isEscaping(const AllocaInst *AI); + bool sinkStores(BasicBlock *Head, BasicBlock *Side, BasicBlock *Tail); + void getSafeStores(DominatorTree &DT); }; } // end anonymous namespace @@ -148,6 +194,35 @@ return Tail && Tail == Right->getSingleSuccessor(); } +/// +/// True when BB is the head of a triangle +/// +bool MergedLoadStoreMotion::isTriangleHead(BasicBlock *BB, BasicBlock *&Side, + BasicBlock *&Tail) { + if (!BB) + return false; + auto *BI = dyn_cast(BB->getTerminator()); + if (!BI || !BI->isConditional()) + return false; + + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Succ1 = BI->getSuccessor(1); + + if (Succ0->hasNPredecessors(1)) { + Side = Succ0; + Tail = Succ1; + return Side->getSingleSuccessor() == Tail; + } + + if (Succ1->hasNPredecessors(1)) { + Side = Succ1; + Tail = Succ0; + return Side->getSingleSuccessor() == Tail; + } + + return false; +} + /// /// True when instruction is a sink barrier for a store /// located in Loc @@ -320,19 +395,235 @@ return {MergedStores, SplitFooter}; } -std::pair MergedLoadStoreMotion::run(Function &F, - AliasAnalysis &AA) { +bool MergedLoadStoreMotion::sinkStores(BasicBlock *Head, BasicBlock *Side, + BasicBlock *Tail) { + // Bail out early if we can not merge into the footer BB. + if (!SplitFooterBB && Tail->hasNPredecessorsOrMore(3)) + return false; + + // Don't sink if there are more than two (terminator and store) instructions + // in the block. + // TODO: Should we try to hoist at least the pointer operand? + Instruction *I = Side->getTerminator()->getPrevNonDebugInstruction(true); + if (I == nullptr) + return false; + if (Instruction *P = I->getPrevNonDebugInstruction(true)) + return false; + + // Don't sink non-simple (atomic, volatile) stores. + if (!isa(I)) + return false; + auto *S = cast(I); + if (!S->isSimple()) + return false; + + // Check it's legal to insert load along an alternative path and to move the + // store. + if (!SafeStores.contains(S)) + return false; + + BasicBlock *NewSide = nullptr; + BasicBlock *Sink = Tail; + + // Create a load on the "other" side of the branch. + if (NewSide == nullptr) + NewSide = SplitBlockPredecessors(Tail, {Head}, ".alt"); + Value *V = S->getValueOperand(); + auto *L = + new LoadInst(V->getType(), S->getPointerOperand(), V->getName() + ".old", + &*NewSide->getFirstInsertionPt()); + + // Sink the store to the tail block. + if (Sink != Tail && Tail->hasNPredecessorsOrMore(3)) + Sink = SplitBlockPredecessors(Tail, {Head, Side}, ".sink"); + + LLVM_DEBUG(dbgs() << "Sink single store into"; + Sink->printAsOperand(dbgs(), false); dbgs() << "Store: "; + S->dump(); dbgs() << "\n"; dbgs() << "Load: "; L->dump(); + dbgs() << "\n"); + S->removeFromParent(); + S->insertBefore(&*Sink->getFirstInsertionPt()); + + auto Phi = + PHINode::Create(V->getType(), 2, V->getName() + ".new", &Sink->front()); + Phi->addIncoming(V, Side); + Phi->addIncoming(L, NewSide); + S->setOperand(0, Phi); + + return true; +} + +/// +/// Check whether the address of a local variable may escape from the function. +/// +bool MergedLoadStoreMotion::isEscaping(const AllocaInst *AI) { + SmallPtrSet Visited; + SmallVector Worklist; + + Worklist.push_back(AI); + while (!Worklist.empty()) { + const Value *V = Worklist.pop_back_val(); + for (const User *U : V->users()) { + const Instruction *I = cast(U); + switch (I->getOpcode()) { + default: + // If not handled in one of the other cases, conservatively assume the + // value escapes. + return true; + case Instruction::Ret: + case Instruction::Load: + case Instruction::AtomicRMW: + // A pointer cannot escape via these instructions. + break; + case Instruction::Call: { + const auto *CI = cast(I); + if (CI->isDebugOrPseudoInst() || CI->isLifetimeStartOrEnd()) + break; + } + LLVM_FALLTHROUGH; + case Instruction::Invoke: + case Instruction::CallBr: + if (any_of(cast(I)->data_ops(), + [=](const Use &Arg) { return Arg.get() == V; })) + return true; + break; + case Instruction::Store: + if (AI == cast(I)->getValueOperand()) + return true; + break; + case Instruction::AtomicCmpXchg: + if (V == cast(I)->getNewValOperand()) + return true; + break; + case Instruction::PHI: + if (!Visited.insert(cast(I)).second) + break; + LLVM_FALLTHROUGH; + case Instruction::GetElementPtr: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + case Instruction::Select: + Worklist.push_back(I); + break; + } + } + } + + return false; +} + +/// +/// Determine the set of "non-trapping" store instructions. +/// +/// The criteria for "non-trapping" are: +/// a) The store must be dominated by a store to or, additionally, for local +/// objects only, by a load from the same memory location. The +/// stack is considered always writable, therefore we allow a load to +/// determine the access to the memory location cannot trap. +/// b) For local objects, the address of the object may not escape from the +/// function along any path in the CFG leading from the `alloca` +/// instruction to the candidate store. For such objects, the conditional +/// store replacement that we do will not create a data race. +/// c) For non-local objects, there may not be function calls (except to +/// `readnone`/`readonly`/`nofree` functions or certain library functions +/// with known semantics) along any path in the CFG leading from the +/// dominating store to the candidate store. +/// +/// The function checks only for access to local objects (for the time being +/// we don't do conditional replacement on stores to non-locals) and the +/// escape check is conservatively global to the function. +/// The function performs a depth-first traversal of the domination tree, +/// maintaining in a stack-like fashion the set of all interesting potentially +/// trapping memory accesses on the path in the DOM tree from the function +/// entry to the current basic block. The stores in the current block are +/// checked for a dominating access to the same memory location and then are +/// added to the set. +/// +void MergedLoadStoreMotion::getSafeStores(DominatorTree &DT) { + // Type used for maintaining conceptually a stack of hash tables. + using UnsafeLocationsTy = ScopedHashTable; + + // Entry of the current path in the DOM tree. + struct PathEntry { + PathEntry(DomTreeNode *N, UnsafeLocationsTy &HT) : Node(N) { + Scope.reset(new UnsafeLocationsTy::ScopeTy(HT)); + } + DomTreeNode *Node; + std::unique_ptr Scope; + }; + + UnsafeLocationsTy Unsafe; // Set of potentially trapping locations. + SmallVector Path; // DOM tree path + + LLVM_DEBUG(dbgs() << "Safe/unsafe access DOM traversal\n";); + + SmallVector Worklist; + Worklist.push_back(DT.getRootNode()); + while (!Worklist.empty()) { + DomTreeNode *Curr = Worklist.back(); + + if (Path.size() && Curr == Path.back().Node) { + // If the next node in the workist is the same as the last node on the + // path it means we have finished traversing the node and all its + // children. + LLVM_DEBUG(auto *B = Curr->getBlock(); dbgs() << "finish "; + B->printAsOperand(dbgs(), false); dbgs() << '\n';); + Worklist.pop_back(); + Path.pop_back(); + continue; + } + + // Enter a new basic block and start a new scope in the hash table. + Path.emplace_back(Curr, Unsafe); + LLVM_DEBUG(auto *B = Curr->getBlock(); dbgs() << "enter "; + B->printAsOperand(dbgs(), false); dbgs() << '\n';); + + // Scan the instructions in the block and examine the loads and the stores. + for (Instruction &I : *Curr->getBlock()) { + if (auto *S = dyn_cast(&I)) { + const auto *AI = + dyn_cast(getUnderlyingObject(S->getPointerOperand())); + if (AI == nullptr || isEscaping(AI)) + continue; + if (Unsafe.count(MemoryLocation::get(S))) { + LLVM_DEBUG(dbgs() << "Found safe store: "; I.dump();); + SafeStores.insert(&I); + } else { + LLVM_DEBUG(dbgs() << "Found unsafe store: "; I.dump();); + Unsafe.insert(MemoryLocation::get(S), true); + } + } else if (auto *L = dyn_cast(&I)) { + const auto *AI = + dyn_cast(getUnderlyingObject(L->getPointerOperand())); + if (AI == nullptr || isEscaping(AI)) + continue; + if (!Unsafe.count(MemoryLocation::get(L))) { + LLVM_DEBUG(dbgs() << "Found unsafe load: "; I.dump();); + Unsafe.insert(MemoryLocation::get(L), true); + } + } + } + + // Traverse the children. + for (auto *C : Curr->children()) + Worklist.push_back(C); + } +} + +std::pair MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA, + DominatorTree &DT) { this->AA = &AA; bool Changed = false; bool CFGChanged = false; LLVM_DEBUG(dbgs() << "Instruction Merger\n"); + getSafeStores(DT); // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. - // This loop doesn't care about newly inserted/split blocks - // since they never will be diamond heads. + // This loop doesn't care about newly inserted/split blocks + // since they will never be diamond or triangle heads. for (BasicBlock &BB : make_early_inc_range(F)) { BasicBlock *Left, *Right, *Tail; if (isDiamondHead(&BB, Left, Right, Tail)) { @@ -341,8 +632,14 @@ std::tie(Change, ChangeCFG) = mergeStores(&BB, Left, Right, Tail); Changed |= Change; CFGChanged |= ChangeCFG; + } else if (isTriangleHead(&BB, Left, Tail)) { + // Replace a conditional store. + if (sinkStores(&BB, Left, Tail)) + Changed = CFGChanged = true; } } + + SafeStores.clear(); return {Changed, CFGChanged}; } @@ -363,17 +660,18 @@ bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; + MergedLoadStoreMotion Impl(SplitFooterBB); bool Change, _; std::tie(Change, _) = - Impl.run(F, getAnalysis().getAAResults()); + Impl.run(F, getAnalysis().getAAResults(), + getAnalysis().getDomTree()); return Change; } private: void getAnalysisUsage(AnalysisUsage &AU) const override { - if (!SplitFooterBB) - AU.setPreservesCFG(); + AU.addRequired(); AU.addRequired(); AU.addPreserved(); } @@ -392,6 +690,7 @@ INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -399,8 +698,10 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { MergedLoadStoreMotion Impl(Options.SplitFooterBB); auto &AA = AM.getResult(F); + auto &DT = AM.getResult(F); + bool Change, CFGChange; - std::tie(Change, CFGChange) = Impl.run(F, AA); + std::tie(Change, CFGChange) = Impl.run(F, AA, DT); if (!Change && !CFGChange) return PreservedAnalyses::all(); Index: llvm/test/CodeGen/AMDGPU/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -490,7 +490,10 @@ ; GCN-O2-NEXT: SROA ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: MergedLoadStoreMotion +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Phi Values Analysis +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Memory Dependence Analysis ; GCN-O2-NEXT: Lazy Branch Probability Analysis @@ -849,7 +852,10 @@ ; GCN-O3-NEXT: SROA ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: MergedLoadStoreMotion +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Phi Values Analysis +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Memory Dependence Analysis ; GCN-O3-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Other/opt-LTO-pipeline.ll =================================================================== --- llvm/test/Other/opt-LTO-pipeline.ll +++ llvm/test/Other/opt-LTO-pipeline.ll @@ -117,9 +117,12 @@ ; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager Index: llvm/test/Other/opt-O2-pipeline.ll =================================================================== --- llvm/test/Other/opt-O2-pipeline.ll +++ llvm/test/Other/opt-O2-pipeline.ll @@ -137,7 +137,10 @@ ; CHECK-NEXT: SROA ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Phi Values Analysis +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Other/opt-O3-pipeline-enable-matrix.ll =================================================================== --- llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -142,7 +142,10 @@ ; CHECK-NEXT: SROA ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Phi Values Analysis +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Other/opt-O3-pipeline.ll =================================================================== --- llvm/test/Other/opt-O3-pipeline.ll +++ llvm/test/Other/opt-O3-pipeline.ll @@ -142,7 +142,10 @@ ; CHECK-NEXT: SROA ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Phi Values Analysis +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Other/opt-Os-pipeline.ll =================================================================== --- llvm/test/Other/opt-Os-pipeline.ll +++ llvm/test/Other/opt-Os-pipeline.ll @@ -123,7 +123,10 @@ ; CHECK-NEXT: SROA ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Phi Values Analysis +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Lazy Branch Probability Analysis