Index: llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp =================================================================== --- llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -14,7 +14,7 @@ // diamond (hammock) and merges them into a single store to the tail block // (footer). The algorithm iterates over the instructions of one side of the // diamond and attempts to find a matching store on the other side. New -// tail/footer block may be insterted if the tail/footer block has more +// tail/footer block may be inserted if the tail/footer block has more // predecessors (not only the two predecessors that are forming the diamond). It // sinks when it thinks it safe to do so. This optimization helps with eg. // hiding load latencies, triggering if-conversion, and reducing static code @@ -62,6 +62,41 @@ // <...> // // +// This pass also replaces conditional stores with unconditional ones (subject to +// constraints). Given a triangle shape before the transformation: +// +// header: +// br %cond, label %if.then, label %if.end +// + + +// + + +// + + +// if.then: + +// store %s, %addr + +// br label %if.end + +// + + +// + + +// + + +// if.end ("footer"): +// <...> +// +// After the conditional store replacement, this becomes: +// header: +// br %cond, label %if.then, label %if.else +// + + +// + + +// + + +// if.then: if.else: +// br label %if.end load %s.old, %addr +// br label %if.end +// + + +// + + +// + + +// if.end ("footer"): +// %s.new = phi [%s, if.then], [%s.old, if.else] +// <...> +// store %s.new, %addr_s +// <...> +// //===----------------------- TODO -----------------------------------------===// // // 1) Generalize to regions other than diamonds @@ -77,6 +112,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" #include 
"llvm/Support/Debug.h" @@ -94,6 +130,8 @@ //===----------------------------------------------------------------------===// class MergedLoadStoreMotion { AliasAnalysis *AA = nullptr; + MemorySSA *MSSA = nullptr; + DominatorTree *DT = nullptr; // The mergeStores algorithms could have Size0 * Size1 complexity, // where Size0 and Size1 are the number instructions on the two sides of @@ -104,11 +142,13 @@ const bool SplitFooterBB; public: MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {} - std::pair run(Function &F, AliasAnalysis &AA); + std::pair run(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, + DominatorTree &DT); private: bool isDiamondHead(BasicBlock *BB, BasicBlock *&Left, BasicBlock *&Right, BasicBlock *&Tail); + bool isTriangleHead(BasicBlock *BB, BasicBlock *&Side, BasicBlock *&Tail); // Routines for sinking stores StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); @@ -119,6 +159,11 @@ StoreInst *ElseInst); std::pair mergeStores(BasicBlock *Head, BasicBlock *Left, BasicBlock *Right, BasicBlock *Tail); + void sinkStore(StoreInst *); + std::pair checkEscapeAndDomination(const StoreInst *S, + const AllocaInst *AI); + bool isNonTrappingStore(const StoreInst *S, bool IsLocal, bool IsEscaping); + bool isLegalToReplaceStore(const StoreInst *S); }; } // end anonymous namespace @@ -148,6 +193,35 @@ return Tail && Tail == Right->getSingleSuccessor(); } +/// +/// True when BB is the head of a triangle +/// +bool MergedLoadStoreMotion::isTriangleHead(BasicBlock *BB, BasicBlock *&Side, + BasicBlock *&Tail) { + if (!BB) + return false; + auto *BI = dyn_cast(BB->getTerminator()); + if (!BI || !BI->isConditional()) + return false; + + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Succ1 = BI->getSuccessor(1); + + if (Succ0->hasNPredecessors(1)) { + Side = Succ0; + Tail = Succ1; + return Side->getSingleSuccessor() == Tail; + } + + if (Succ1->hasNPredecessors(1)) 
{ + Side = Succ1; + Tail = Succ0; + return Side->getSingleSuccessor() == Tail; + } + + return false; +} + /// /// True when instruction is a sink barrier for a store /// located in Loc @@ -320,15 +394,243 @@ return {MergedStores, SplitFooter}; } -std::pair MergedLoadStoreMotion::run(Function &F, - AliasAnalysis &AA) { - this->AA = &AA; +/// +/// Check whether the address `AI` of a local variable may escape from the +/// function and whether a load from or a store to precisely the same +/// location as the store `S` properly dominates `S`. `S` must be a simple store +/// to the object, allocated by `AI`. Return the escape property in the first +/// element and the domination property in the second element of the pair. +/// +std::pair +MergedLoadStoreMotion::checkEscapeAndDomination(const StoreInst *S, + const AllocaInst *AI) { + bool Escape = false; + bool Dominate = false; + SmallPtrSet Visited; + SmallVector Worklist; + + Worklist.push_back(AI); + while (!(Escape && Dominate) && !Worklist.empty()) { + const Value *V = Worklist.pop_back_val(); + for (const User *U : V->users()) { + const Instruction *I = dyn_cast(U); + if (I == nullptr || I == S) + continue; + switch (I->getOpcode()) { + default: + // If not handled in one of the other cases, conservatively assume the + // value escapes. + Escape = true; + break; + case Instruction::Ret: + case Instruction::AtomicRMW: + // A pointer cannot escape via these instructions. 
+ break; + case Instruction::Load: + if (!Dominate && + AA->alias(MemoryLocation::get(S), MemoryLocation::get(I)) == + AliasResult::MustAlias) + Dominate = DT->dominates(I, S); + break; + case Instruction::Call: { + const auto *CI = cast(I); + if (CI->isDebugOrPseudoInst() || CI->isLifetimeStartOrEnd()) + break; + } + LLVM_FALLTHROUGH; + case Instruction::Invoke: + case Instruction::CallBr: + if (any_of(cast(I)->data_ops(), + [=](const Use &Arg) { return Arg.get() == V; })) + Escape = true; + break; + case Instruction::Store: + if (V == cast(I)->getValueOperand()) + Escape = true; + if (!Dominate && + AA->alias(MemoryLocation::get(S), MemoryLocation::get(I)) == + AliasResult::MustAlias) + Dominate = DT->dominates(I, S); + break; + case Instruction::AtomicCmpXchg: + if (V == cast(I)->getNewValOperand()) + Escape = true; + break; + case Instruction::PHI: + if (!Visited.insert(cast(I)).second) + break; + LLVM_FALLTHROUGH; + case Instruction::GetElementPtr: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + case Instruction::Select: + Worklist.push_back(I); + break; + } + } + } - bool Changed = false; - bool CFGChanged = false; + return {Escape, Dominate}; +} + +bool MergedLoadStoreMotion::isNonTrappingStore(const StoreInst *S, bool IsLocal, + bool IsEscaping) { + SmallVector Worklist; + SmallPtrSet Visited; + + MemoryAccess *SDef = MSSA->getMemoryAccess(S); + Worklist.append(SDef->defs_begin(), SDef->defs_end()); + + while (!Worklist.empty()) { + const MemoryAccess *M = Worklist.pop_back_val(); + + // Can't guarantee anything if we reach the initial node. + if (MSSA->isLiveOnEntryDef(M)) + return false; + + // Continue traversing along the MemoryPhi operands. 
+ if (const auto *Phi = dyn_cast(M)) { + if (Visited.insert(Phi).second) + Worklist.append(Phi->defs_begin(), Phi->defs_end()); + continue; + } + + assert(isa(M) && "Unexpected MemoryUse"); + const auto *MDef = cast(M); + Instruction *I = MDef->getMemoryInst(); + + // Prevent an instruction from "shielding" itself. + if (I == S) + continue; + + switch (I->getOpcode()) { + default: + // Unknown MemoryDef or an atomic instruction, assume invalidating and + // bail out. + return false; + case Instruction::Store: + // Ordered or volatile store, assume invalidating. + if (!cast(I)->isSimple()) + return false; + // A write to precisely the same location feeding into the candidate store + // "shields" it from traps along the dependency chain. + // A write to a separate or partially overlapping memory location does + // not guarantee the candidate store is non-trapping, but does not + // preclude it from being such either, so continue traversing up the + // dependency chain. + if (AA->alias(MemoryLocation::get(S), MemoryLocation::get(I)) != + AliasResult::MustAlias) + Worklist.push_back(MDef->getDefiningAccess()); + break; + case Instruction::Call: + if (!IsLocal || IsEscaping) { + const auto *CI = cast(I); + if (!CI->isDebugOrPseudoInst() && !CI->isLifetimeStartOrEnd()) + // TODO: Continue traversing across well-behaved calls: stdlib + // functions, readonly/readnone, etc. + return false; + } + Worklist.push_back(MDef->getDefiningAccess()); + break; + } + } + + return true; +} + +// Check if it's legal to replace a conditional store with a load and +// unconditional store. 
+bool MergedLoadStoreMotion::isLegalToReplaceStore(const StoreInst *S) { + if (!S->isSimple()) + return false; + bool IsLocal = false, IsEscaping = false, HasDominating = false; + if (auto *AI = + dyn_cast(getUnderlyingObject(S->getPointerOperand()))) { + IsLocal = true; + std::tie(IsEscaping, HasDominating) = checkEscapeAndDomination(S, AI); + } + if (IsLocal && !IsEscaping && HasDominating) + return true; + return isNonTrappingStore(S, IsLocal, IsEscaping); +} + +void MergedLoadStoreMotion::sinkStore(StoreInst *S) { + BasicBlock *Body = S->getParent(); + BasicBlock *Head = Body->getSinglePredecessor(); + BasicBlock *Tail = Body->getSingleSuccessor(); + assert(Head && Tail && "Invalid CFG shape for single store replacement"); + + // Create a load on the "other" side of the branch. + BasicBlock *NewBody = SplitBlockPredecessors(Tail, {Head}, ".alt"); + Value *V = S->getValueOperand(); + auto *L = + new LoadInst(V->getType(), S->getPointerOperand(), V->getName() + ".old", + &*NewBody->getFirstInsertionPt()); + + // Sink the store to the tail block. + if (Tail->hasNPredecessorsOrMore(3)) + Tail = SplitBlockPredecessors(Tail, {Head, Body}, ".sink"); + + LLVM_DEBUG(dbgs() << "Sink single store into"; + Tail->printAsOperand(dbgs(), false); + dbgs() << "Store: ";S->dump(); + dbgs() << "Load: "; L->dump()); + S->removeFromParent(); + S->insertBefore(&*Tail->getFirstInsertionPt()); + + auto *Phi = + PHINode::Create(V->getType(), 2, V->getName() + ".new", &Tail->front()); + Phi->addIncoming(V, Body); + Phi->addIncoming(L, NewBody); + S->setOperand(0, Phi); +} + +std::pair MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA, + MemorySSA &MSSA, + DominatorTree &DT) { + this->AA = &AA; + this->MSSA = &MSSA; + this->DT = &DT; LLVM_DEBUG(dbgs() << "Instruction Merger\n"); + // Collect conditional stores which are legal to replace with unconditional + // ones. 
Do this before making any changes to the function, so we are using up + // to date dominator tree and Memory SSA graph. + SmallVector CondStores; + for (BasicBlock &BB : F) { + BasicBlock *IfBody, *IfTail; + if (!isTriangleHead(&BB, IfBody, IfTail)) + continue; + + // Have to insert a new tail, but it's disallowed. + if (!SplitFooterBB && IfTail->hasNPredecessorsOrMore(3)) + continue; + + // Don't sink if there are more than two (terminator and store) instructions + // in the block. + Instruction *I = IfBody->getTerminator()->getPrevNonDebugInstruction(true); + if (I == nullptr) + continue; + if (Instruction *P = I->getPrevNonDebugInstruction(true)) + continue; + + // Don't sink if it could introduce data race or invalid memory + // access to do so. + auto *S = dyn_cast(I); + if (S == nullptr || !isLegalToReplaceStore(S)) + continue; + + CondStores.push_back(S); + } + + // Now that we have collected the stores, do the replacement. + for (StoreInst *S : CondStores) + sinkStore(S); + + bool CFGChanged = !CondStores.empty(); + bool Changed = false; + // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. 
// This loop doesn't care about newly inserted/split blocks @@ -343,6 +645,7 @@ CFGChanged |= ChangeCFG; } } + return {Changed, CFGChanged}; } @@ -366,15 +669,17 @@ MergedLoadStoreMotion Impl(SplitFooterBB); bool Change, _; std::tie(Change, _) = - Impl.run(F, getAnalysis().getAAResults()); + Impl.run(F, getAnalysis().getAAResults(), + getAnalysis().getMSSA(), + getAnalysis().getDomTree()); return Change; } private: void getAnalysisUsage(AnalysisUsage &AU) const override { - if (!SplitFooterBB) - AU.setPreservesCFG(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addPreserved(); } }; @@ -392,6 +697,8 @@ INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -399,8 +706,11 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { MergedLoadStoreMotion Impl(Options.SplitFooterBB); auto &AA = AM.getResult(F); + auto &MSSA = AM.getResult(F).getMSSA(); + auto &DT = AM.getResult(F); + bool Change, CFGChange; - std::tie(Change, CFGChange) = Impl.run(F, AA); + std::tie(Change, CFGChange) = Impl.run(F, AA, MSSA, DT); if (!Change && !CFGChange) return PreservedAnalyses::all(); Index: llvm/test/CodeGen/AMDGPU/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -489,8 +489,12 @@ ; GCN-O2-NEXT: Unroll loops ; GCN-O2-NEXT: SROA ; GCN-O2-NEXT: Function Alias Analysis Results +; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: MergedLoadStoreMotion +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Phi Values Analysis +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) 
; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Memory Dependence Analysis ; GCN-O2-NEXT: Lazy Branch Probability Analysis @@ -848,8 +852,12 @@ ; GCN-O3-NEXT: Unroll loops ; GCN-O3-NEXT: SROA ; GCN-O3-NEXT: Function Alias Analysis Results +; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: MergedLoadStoreMotion +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Phi Values Analysis +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Memory Dependence Analysis ; GCN-O3-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Other/opt-LTO-pipeline.ll =================================================================== --- llvm/test/Other/opt-LTO-pipeline.ll +++ llvm/test/Other/opt-LTO-pipeline.ll @@ -117,9 +117,12 @@ ; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager Index: llvm/test/Other/opt-O2-pipeline.ll =================================================================== --- llvm/test/Other/opt-O2-pipeline.ll +++ llvm/test/Other/opt-O2-pipeline.ll @@ -136,8 +136,12 @@ ; CHECK-NEXT: Unroll loops ; CHECK-NEXT: SROA ; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Phi Values Analysis +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Lazy Branch Probability Analysis 
Index: llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
===================================================================
--- llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
+++ llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
@@ -141,8 +141,12 @@
 ; CHECK-NEXT: Unroll loops
 ; CHECK-NEXT: SROA
 ; CHECK-NEXT: Function Alias Analysis Results
+; CHECK-NEXT: Memory SSA
 ; CHECK-NEXT: MergedLoadStoreMotion
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Phi Values Analysis
+; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Memory Dependence Analysis
 ; CHECK-NEXT: Lazy Branch Probability Analysis
Index: llvm/test/Other/opt-O3-pipeline.ll
===================================================================
--- llvm/test/Other/opt-O3-pipeline.ll
+++ llvm/test/Other/opt-O3-pipeline.ll
@@ -141,8 +141,12 @@
 ; CHECK-NEXT: Unroll loops
 ; CHECK-NEXT: SROA
 ; CHECK-NEXT: Function Alias Analysis Results
+; CHECK-NEXT: Memory SSA
 ; CHECK-NEXT: MergedLoadStoreMotion
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Phi Values Analysis
+; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Memory Dependence Analysis
 ; CHECK-NEXT: Lazy Branch Probability Analysis
Index: llvm/test/Other/opt-Os-pipeline.ll
===================================================================
--- llvm/test/Other/opt-Os-pipeline.ll
+++ llvm/test/Other/opt-Os-pipeline.ll
@@ -122,8 +122,12 @@
 ; CHECK-NEXT: Unroll loops
 ; CHECK-NEXT: SROA
 ; CHECK-NEXT: Function Alias Analysis Results
+; CHECK-NEXT: Memory SSA
 ; CHECK-NEXT: MergedLoadStoreMotion
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Phi Values Analysis
+; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Memory Dependence Analysis
 ; CHECK-NEXT: Lazy Branch Probability Analysis