Index: llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp =================================================================== --- llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -14,7 +14,7 @@ // diamond (hammock) and merges them into a single store to the tail block // (footer). The algorithm iterates over the instructions of one side of the // diamond and attempts to find a matching store on the other side. New -// tail/footer block may be insterted if the tail/footer block has more +// tail/footer block may be inserted if the tail/footer block has more // predecessors (not only the two predecessors that are forming the diamond). It // sinks when it thinks it safe to do so. This optimization helps with eg. // hiding load latencies, triggering if-conversion, and reducing static code @@ -62,6 +62,41 @@ // <...> // // +// This pass also replaces conditional stores with unconditional ones (subject to +// constraints). Given a triangle shape before the transformation: +// +// header: +// br %cond, label %if.then, label %if.end +// + + +// + + +// + + +// if.then: + +// store %s, %addr + +// br label %if.end + +// + + +// + + +// + + +// if.end ("footer"): +// <...> +// +// After the conditional store replacement, this becomes: +// header: +// br %cond, label %if.then, label %if.else +// + + +// + + +// + + +// if.then: if.else: +// br label %if.end load %s.old, %addr +// br label %if.end +// + + +// + + +// + + +// if.end ("footer"): +// %s.new = phi [%s, if.then], [%s.old, if.else] +// <...> +// store %s.new, %addr_s +// <...> +// //===----------------------- TODO -----------------------------------------===// // // 1) Generalize to regions other than diamonds @@ -76,6 +111,7 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Metadata.h" #include "llvm/InitializePasses.h" @@ -94,6 +130,8 @@ //===----------------------------------------------------------------------===// class MergedLoadStoreMotion { AliasAnalysis *AA = nullptr; + MemorySSA *MSSA = nullptr; + DominatorTree *DT = nullptr; // The mergeStores algorithms could have Size0 * Size1 complexity, // where Size0 and Size1 are the number instructions on the two sides of @@ -104,11 +142,13 @@ const bool SplitFooterBB; public: MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {} - std::pair run(Function &F, AliasAnalysis &AA); + std::pair run(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, + DominatorTree &DT); private: bool isDiamondHead(BasicBlock *BB, BasicBlock *&Left, BasicBlock *&Right, BasicBlock *&Tail); + bool isTriangleHead(BasicBlock *BB, BasicBlock *&Side, BasicBlock *&Tail); // Routines for sinking stores StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); @@ -118,7 +158,13 @@ void sinkStoresAndGEPs(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); std::pair mergeStores(BasicBlock *Head, BasicBlock *Left, - BasicBlock *Right, BasicBlock *Tail); + BasicBlock *Right, BasicBlock *Tail, + DomTreeUpdater &DTU); + void sinkStore(StoreInst *, DomTreeUpdater &DTU); + std::pair checkEscapeAndDomination(const StoreInst *S, + const AllocaInst *AI); + bool isNonTrappingStore(const StoreInst *S, bool IsLocal, bool IsEscaping); + bool isLegalToReplaceStore(const StoreInst *S); }; } // end anonymous namespace @@ 
-148,6 +194,35 @@ return Tail && Tail == Right->getSingleSuccessor(); } +/// +/// True when BB is the head of a triangle +/// +bool MergedLoadStoreMotion::isTriangleHead(BasicBlock *BB, BasicBlock *&Side, + BasicBlock *&Tail) { + if (!BB) + return false; + auto *BI = dyn_cast(BB->getTerminator()); + if (!BI || !BI->isConditional()) + return false; + + BasicBlock *Succ0 = BI->getSuccessor(0); + BasicBlock *Succ1 = BI->getSuccessor(1); + + if (Succ0->hasNPredecessors(1)) { + Side = Succ0; + Tail = Succ1; + return Side->getSingleSuccessor() == Tail; + } + + if (Succ1->hasNPredecessors(1)) { + Side = Succ1; + Tail = Succ0; + return Side->getSingleSuccessor() == Tail; + } + + return false; +} + /// /// True when instruction is a sink barrier for a store /// located in Loc @@ -270,7 +345,8 @@ std::pair MergedLoadStoreMotion::mergeStores(BasicBlock *HeadBB, BasicBlock *LeftBB, BasicBlock *RightBB, - BasicBlock *TailBB) { + BasicBlock *TailBB, + DomTreeUpdater &DTU) { bool MergedStores = false; bool SplitFooter = false; BasicBlock *SinkBB = TailBB; @@ -308,7 +384,8 @@ if (SinkBB == TailBB && TailBB->hasNPredecessorsOrMore(3)) { // We have more than 2 predecessors. Insert a new block // postdominating 2 predecessors we're going to sink from. - SinkBB = SplitBlockPredecessors(TailBB, {LeftBB, RightBB}, ".sink.split"); + SinkBB = SplitBlockPredecessors(TailBB, {LeftBB, RightBB}, ".sink.split", + &DTU); if (!SinkBB) break; SplitFooter = true; @@ -320,15 +397,246 @@ return {MergedStores, SplitFooter}; } -std::pair MergedLoadStoreMotion::run(Function &F, - AliasAnalysis &AA) { - this->AA = &AA; +/// +/// Check whether the address `AI` of a local variable may escape from the +/// function and whether a load from or a store to precisely the same +/// location as the store `S` properly dominates `S`. `S` must be a simple store +/// to the object, allocated by `AI`. Return the escape property in the first +/// element and the domination property in the second element of the pair. +/// +std::pair +MergedLoadStoreMotion::checkEscapeAndDomination(const StoreInst *S, + const AllocaInst *AI) { + bool Escape = false; + bool Dominate = false; + SmallPtrSet Visited; + SmallVector Worklist; + + Worklist.push_back(AI); + while (!(Escape && Dominate) && !Worklist.empty()) { + const Value *V = Worklist.pop_back_val(); + for (const User *U : V->users()) { + const Instruction *I = dyn_cast(U); + if (I == nullptr || I == S) + continue; + switch (I->getOpcode()) { + default: + // If not handled in one of the other cases, conservatively assume the + // value escapes. + Escape = true; + break; + case Instruction::Ret: + case Instruction::AtomicRMW: + // A pointer cannot escape via these instructions. 
+ break; + case Instruction::Load: + if (!Dominate && + AA->alias(MemoryLocation::get(S), MemoryLocation::get(I)) == + AliasResult::MustAlias) + Dominate = DT->dominates(I, S); + break; + case Instruction::Call: { + const auto *CI = cast(I); + if (CI->isDebugOrPseudoInst() || CI->isLifetimeStartOrEnd()) + break; + } + LLVM_FALLTHROUGH; + case Instruction::Invoke: + case Instruction::CallBr: + if (any_of(cast(I)->data_ops(), + [=](const Use &Arg) { return Arg.get() == V; })) + Escape = true; + break; + case Instruction::Store: + if (V == cast(I)->getValueOperand()) + Escape = true; + if (!Dominate && + AA->alias(MemoryLocation::get(S), MemoryLocation::get(I)) == + AliasResult::MustAlias) + Dominate = DT->dominates(I, S); + break; + case Instruction::AtomicCmpXchg: + if (V == cast(I)->getNewValOperand()) + Escape = true; + break; + case Instruction::PHI: + if (!Visited.insert(cast(I)).second) + break; + LLVM_FALLTHROUGH; + case Instruction::GetElementPtr: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + case Instruction::Select: + Worklist.push_back(I); + break; + } + } + } - bool Changed = false; - bool CFGChanged = false; + return {Escape, Dominate}; +} + +bool MergedLoadStoreMotion::isNonTrappingStore(const StoreInst *S, bool IsLocal, + bool IsEscaping) { + SmallVector Worklist; + SmallPtrSet Visited; + + MemoryAccess *SDef = MSSA->getMemoryAccess(S); + Worklist.append(SDef->defs_begin(), SDef->defs_end()); + + while (!Worklist.empty()) { + const MemoryAccess *M = Worklist.pop_back_val(); + + // Can't guarantee anything if we reach the initial node. + if (MSSA->isLiveOnEntryDef(M)) + return false; + + // Continue traversing along the MemoryPhi operands. + if (const auto *Phi = dyn_cast(M)) { + if (Visited.insert(Phi).second) + Worklist.append(Phi->defs_begin(), Phi->defs_end()); + continue; + } + + assert(isa(M) && "Unexpected MemoryUse"); + const auto *MDef = cast(M); + Instruction *I = MDef->getMemoryInst(); + + // We can reach the same instruction (along a cyclic path), stop traversing + // at this point. + if (I == S) + continue; + + switch (I->getOpcode()) { + default: + // Unknown MemoryDef or an atomic instruction, assume invalidating and + // bail out. + return false; + case Instruction::Store: + // Ordered or volatile store, assume invalidating. + if (!cast(I)->isSimple()) + return false; + // A write to precisely the same location feeding into the candidate store + // "shields" it from traps along the dependency chain. + // A write to a separate or partially overlapping memory location does + // not guarantee the candidate store is non-trapping, but does not + // preclude it from being such either, so continue traversing up the + // dependency chain. + if (AA->alias(MemoryLocation::get(S), MemoryLocation::get(I)) != + AliasResult::MustAlias) + Worklist.push_back(MDef->getDefiningAccess()); + break; + case Instruction::Call: + if (!IsLocal || IsEscaping) { + const auto *CI = cast(I); + if (!CI->isDebugOrPseudoInst() && !CI->isLifetimeStartOrEnd()) + // TODO: Continue traversing across well-behaved calls: stdlib + // functions, readonly/readnone, etc. + return false; + } + Worklist.push_back(MDef->getDefiningAccess()); + break; + } + } + + return true; +} + +// Check if it's legal to replace a conditional store with a load and +// unconditional store. 
+bool MergedLoadStoreMotion::isLegalToReplaceStore(const StoreInst *S) {
+  if (!S->isSimple())
+    return false;
+  bool IsLocal = false, IsEscaping = false, HasDominating = false;
+  if (auto *AI =
+          dyn_cast<AllocaInst>(getUnderlyingObject(S->getPointerOperand()))) {
+    IsLocal = true;
+    std::tie(IsEscaping, HasDominating) = checkEscapeAndDomination(S, AI);
+  }
+  if (IsLocal && !IsEscaping && HasDominating)
+    return true;
+  return isNonTrappingStore(S, IsLocal, IsEscaping);
+}
+
+void MergedLoadStoreMotion::sinkStore(StoreInst *S, DomTreeUpdater &DTU) {
+  BasicBlock *Body = S->getParent();
+  BasicBlock *Head = Body->getSinglePredecessor();
+  BasicBlock *Tail = Body->getSingleSuccessor();
+  assert(Head && Tail && "Invalid CFG shape for single store replacement");
+
+  // Create a load on the "other" side of the branch.
+  BasicBlock *NewBody = SplitBlockPredecessors(Tail, {Head}, ".for.load", &DTU);
+  Value *V = S->getValueOperand();
+  auto *L =
+      new LoadInst(V->getType(), S->getPointerOperand(), V->getName() + ".old",
+                   &*NewBody->getFirstInsertionPt());
+
+  // Sink the store to the tail block.
+  if (Tail->hasNPredecessorsOrMore(3))
+    Tail = SplitBlockPredecessors(Tail, {Head, Body}, ".sink", &DTU);
+
+  LLVM_DEBUG(dbgs() << "Sink single store into ";
+             Tail->printAsOperand(dbgs(), false);
+             dbgs() << "\nStore: "; S->dump();
+             dbgs() << "Load: "; L->dump());
+  S->removeFromParent();
+  S->insertBefore(&*Tail->getFirstInsertionPt());
+
+  auto *Phi =
+      PHINode::Create(V->getType(), 2, V->getName() + ".new", &Tail->front());
+  Phi->addIncoming(V, Body);
+  Phi->addIncoming(L, NewBody);
+  S->setOperand(0, Phi);
+}
+
+std::pair<bool, bool> MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA,
+                                                 MemorySSA &MSSA,
+                                                 DominatorTree &DT) {
+  this->AA = &AA;
+  this->MSSA = &MSSA;
+  this->DT = &DT;
   LLVM_DEBUG(dbgs() << "Instruction Merger\n");
+  // Collect conditional stores which are legal to replace with unconditional
+  // ones. Do this before making any changes to the function, so we are using
+  // an up-to-date dominator tree and MemorySSA graph.
+  SmallVector<StoreInst *> CondStores;
+  for (BasicBlock &BB : F) {
+    BasicBlock *IfBody, *IfTail;
+    if (!isTriangleHead(&BB, IfBody, IfTail))
+      continue;
+
+    // Have to insert a new tail, but it's disallowed.
+    if (!SplitFooterBB && IfTail->hasNPredecessorsOrMore(3))
+      continue;
+
+    // Don't sink if the block contains more than two instructions (the store
+    // and the terminator).
+    Instruction *I = IfBody->getTerminator()->getPrevNonDebugInstruction(true);
+    if (I == nullptr)
+      continue;
+    if (Instruction *P = I->getPrevNonDebugInstruction(true))
+      continue;
+
+    // Don't sink if doing so could introduce a data race or an invalid memory
+    // access.
+    auto *S = dyn_cast<StoreInst>(I);
+    if (S == nullptr || !isLegalToReplaceStore(S))
+      continue;
+
+    CondStores.push_back(S);
+  }
+
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+
+  // Now that we have collected the stores, do the replacement.
+  for (StoreInst *S : CondStores)
+    sinkStore(S, DTU);
+
+  bool CFGChanged = !CondStores.empty();
+  bool Changed = CFGChanged;
+
   // Merge unconditional branches, allowing PRE to catch more
   // optimization opportunities.
   // This loop doesn't care about newly inserted/split blocks
@@ -338,11 +646,12 @@
     if (isDiamondHead(&BB, Left, Right, Tail)) {
       // Merge and sink store pairs outside diamonds when possible.
bool Change, ChangeCFG; - std::tie(Change, ChangeCFG) = mergeStores(&BB, Left, Right, Tail); + std::tie(Change, ChangeCFG) = mergeStores(&BB, Left, Right, Tail, DTU); Changed |= Change; CFGChanged |= ChangeCFG; } } + return {Changed, CFGChanged}; } @@ -366,15 +675,17 @@ MergedLoadStoreMotion Impl(SplitFooterBB); bool Change, _; std::tie(Change, _) = - Impl.run(F, getAnalysis().getAAResults()); + Impl.run(F, getAnalysis().getAAResults(), + getAnalysis().getMSSA(), + getAnalysis().getDomTree()); return Change; } private: void getAnalysisUsage(AnalysisUsage &AU) const override { - if (!SplitFooterBB) - AU.setPreservesCFG(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addPreserved(); } }; @@ -392,6 +703,8 @@ INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -399,12 +712,16 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { MergedLoadStoreMotion Impl(Options.SplitFooterBB); auto &AA = AM.getResult(F); + auto &MSSA = AM.getResult(F).getMSSA(); + auto &DT = AM.getResult(F); + bool Change, CFGChange; - std::tie(Change, CFGChange) = Impl.run(F, AA); - if (!Change && !CFGChange) + std::tie(Change, CFGChange) = Impl.run(F, AA, MSSA, DT); + if (!Change) return PreservedAnalyses::all(); PreservedAnalyses PA; + PA.preserve(); if (!CFGChange) PA.preserveSet(); return PA; Index: llvm/test/CodeGen/AMDGPU/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -489,8 +489,12 @@ ; GCN-O2-NEXT: Unroll loops ; GCN-O2-NEXT: SROA ; GCN-O2-NEXT: Function Alias Analysis Results +; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: MergedLoadStoreMotion +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Phi Values Analysis +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Memory Dependence Analysis ; GCN-O2-NEXT: Lazy Branch Probability Analysis @@ -848,8 +852,12 @@ ; GCN-O3-NEXT: Unroll loops ; GCN-O3-NEXT: SROA ; GCN-O3-NEXT: Function Alias Analysis Results +; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: MergedLoadStoreMotion +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Phi Values Analysis +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Memory Dependence Analysis ; GCN-O3-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Other/opt-LTO-pipeline.ll =================================================================== --- llvm/test/Other/opt-LTO-pipeline.ll +++ llvm/test/Other/opt-LTO-pipeline.ll @@ -117,9 +117,12 @@ ; CHECK-NEXT: Dead Store Elimination ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier ; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager Index: 
llvm/test/Other/opt-O2-pipeline.ll =================================================================== --- llvm/test/Other/opt-O2-pipeline.ll +++ llvm/test/Other/opt-O2-pipeline.ll @@ -136,8 +136,12 @@ ; CHECK-NEXT: Unroll loops ; CHECK-NEXT: SROA ; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Phi Values Analysis +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Other/opt-O3-pipeline-enable-matrix.ll =================================================================== --- llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -141,8 +141,12 @@ ; CHECK-NEXT: Unroll loops ; CHECK-NEXT: SROA ; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Phi Values Analysis +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Other/opt-O3-pipeline.ll =================================================================== --- llvm/test/Other/opt-O3-pipeline.ll +++ llvm/test/Other/opt-O3-pipeline.ll @@ -141,8 +141,12 @@ ; CHECK-NEXT: Unroll loops ; CHECK-NEXT: SROA ; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Phi Values Analysis +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Other/opt-Os-pipeline.ll =================================================================== --- llvm/test/Other/opt-Os-pipeline.ll +++ llvm/test/Other/opt-Os-pipeline.ll @@ -122,8 +122,12 @@ ; CHECK-NEXT: Unroll loops ; CHECK-NEXT: SROA ; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Memory SSA ; CHECK-NEXT: MergedLoadStoreMotion +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Phi Values Analysis +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Memory Dependence Analysis ; CHECK-NEXT: Lazy Branch Probability Analysis Index: llvm/test/Transforms/InstMerge/cond-store-elim.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/InstMerge/cond-store-elim.ll @@ -0,0 +1,460 @@ +; RUN: opt --passes=mldst-motion,verify -S %s -o - | FileCheck %s + +; Local store, dominated by a load, non-escaping +define i32 @f0(i64 %k, i32 %b) { +entry: + %a = alloca i64, align 8 + %tmpcast = bitcast i64* %a to [2 x i32]* + store i64 4294967296, i64* %a, align 8 + %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %tmpcast, i64 0, i64 %k + %0 = load i32, i32* %arrayidx, align 4 + %cmp = icmp sgt i32 %0, %b + br i1 %cmp, label %if.then, label %if.end +; New control flow +; CHECK: br i1 %cmp, label %if.then, label %if.end.for.load + +if.then: + store i32 %b, i32* %arrayidx, align 4 + br label %if.end +; 
Store moved out of the block +; CHECK: if.then: +; CHECK-NEXT: br label %if.end + +; Load inserted +; CHECK: if.end.for.load: +; CHECK-NEXT: %b.old = load i32, i32* %arrayidx, align 4 +; CHECK-NEXT: br label %if.end + +if.end: + %arrayidx2 = bitcast i64* %a to i32* + %1 = load i32, i32* %arrayidx2, align 8 + %arrayidx3 = getelementptr inbounds [2 x i32], [2 x i32]* %tmpcast, i64 0, i64 1 + %2 = load i32, i32* %arrayidx3, align 4 + %add = add nsw i32 %2, %1 + ret i32 %add +; Store moved here +; CHECK: if.end: +; CHECK-NEXT: %b.new = phi i32 [ %b, %if.then ], [ %b.old, %if.end.for.load ] +; CHECK-NEXT: store i32 %b.new, i32* %arrayidx, align 4 +} + +; Local store, dominated by a load, escaping (missed optimisation) +define i32 @f1(i64 %k, i32 %b) { +entry: + %a = alloca i64, align 8 + %tmpcast = bitcast i64* %a to [2 x i32]* + store i64 4294967296, i64* %a, align 8 + %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %tmpcast, i64 0, i64 %k + %0 = load i32, i32* %arrayidx, align 4 + %cmp = icmp sgt i32 %0, %b + br i1 %cmp, label %if.then, label %if.end +; CHECK: br i1 %cmp, label %if.then, label %if.end + +if.then: + store i32 %b, i32* %arrayidx, align 4 + br label %if.end +; CHECK: if.then: +; CHECK-NEXT: store i32 %b, i32* %arrayidx, align 4 + +if.end: + %arrayidx2 = bitcast i64* %a to i32* + %1 = load i32, i32* %arrayidx2, align 8 + %arrayidx3 = getelementptr inbounds [2 x i32], [2 x i32]* %tmpcast, i64 0, i64 1 + %2 = load i32, i32* %arrayidx3, align 4 + %add = add nsw i32 %2, %1 +; Escape of the local variable here prevents the optimisation + call void @g(i64* %a) + ret i32 %add +} + +declare void @g(i64 *) + +; Local store, loads on all paths (missed optimisation) +define i32 @f2(i64 %k, i64 %j, i32 %b, i32 %c) { +entry: + %a = alloca i64, align 8 + %tmpcast = bitcast i64* %a to [2 x i32]* + store i64 4294967296, i64* %a, align 8 + %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %tmpcast, i64 0, i64 %k + %cmp = icmp eq i64 %k, %j + br i1 %cmp, label %if.then, label %if.else + +; MemorySSA can't be used to reach these two loads +if.then: + %0 = load i32, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %1 = load i32, i32* %arrayidx, align 4 + br label %if.end + +if.end: + %cmp7 = icmp slt i32 %b, %c + br i1 %cmp7, label %if.then8, label %if.end9 + +if.then8: + store i32 %b, i32* %arrayidx, align 4 + br label %if.end9 +; CHECK: if.then8: +; CHECK-NEXT: store i32 %b, i32* %arrayidx, align 4 +; CHECK-NEXT: br label %if.end9 + +if.end9: + %arrayidx10 = bitcast i64* %a to i32* + %2 = load i32, i32* %arrayidx10, align 8 + %arrayidx11 = getelementptr inbounds [2 x i32], [2 x i32]* %tmpcast, i64 0, i64 1 + %3 = load i32, i32* %arrayidx11, align 4 + %add12 = add nsw i32 %3, %2 + ret i32 %add12 +} + +; Dominating non-local load, not optimised +define i32 @f3(i64 %k, i32 %b, i32* nocapture %a) { +entry: + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %k + %0 = load i32, i32* %arrayidx, align 4 + %cmp = icmp sgt i32 %0, %b + br i1 %cmp, label %if.then, label %if.end + +if.then: + store i32 %b, i32* %arrayidx, align 4 + br label %if.end +; CHECK: if.then: +; CHECK-NEXT: store i32 %b, i32* %arrayidx, align 4 +; CHECK-NEXT: br label %if.end + +if.end: + %1 = load i32, i32* %a, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 1 + %2 = load i32, i32* %arrayidx3, align 4 + %add = add nsw i32 %2, %1 + ret i32 %add +} + +; Dominating non-local store +define i32 @f4(i64 %k, i64 %j, i32 %b, i32* %a) { +entry: + %arrayidx = getelementptr inbounds i32, i32* %a, i64 
%k
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %j
+  store i32 %0, i32* %arrayidx1, align 4
+  %1 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp sgt i32 %1, %b
+  br i1 %cmp, label %if.then, label %if.end
+; New control flow
+; CHECK: br i1 %cmp, label %if.then, label %if.end.for.load
+
+if.then:
+  store i32 %b, i32* %arrayidx1, align 4
+  br label %if.end
+; Store moved out
+; CHECK: if.then:
+; CHECK-NEXT: br label %if.end
+
+; Load inserted
+; CHECK: if.end.for.load:
+; CHECK-NEXT: %b.old = load i32, i32* %arrayidx1, align 4
+; CHECK-NEXT: br label %if.end
+
+if.end:
+  %2 = load i32, i32* %a, align 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 1
+  %3 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %2, %3
+  ret i32 %add
+; Store moved here
+; CHECK: if.end:
+; CHECK-NEXT: %b.new = phi i32 [ %b, %if.then ], [ %b.old, %if.end.for.load ]
+; CHECK-NEXT: store i32 %b.new, i32* %arrayidx1, align 4
+}
+
+; Non-local stores on all paths
+define i32 @f5(i64 %k, i64 %j, i32 %b, i32* nocapture %a) #0 {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %k
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %j
+  %cmp = icmp slt i64 %k, %j
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %0 = load i32, i32* %arrayidx, align 4
+; This store ...
+  store i32 %0, i32* %arrayidx1, align 4
+  br label %if.end
+
+if.else:
+  %add = add nsw i64 %j, %k
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %add
+  %1 = load i32, i32* %arrayidx2, align 4
+; ... and this one precede the candidate store in all executions.
+  store i32 %1, i32* %arrayidx1, align 4
+  store i32 1, i32* %arrayidx2, align 4
+  br label %if.end
+
+if.end:
+  %2 = load i32, i32* %arrayidx, align 4
+  %cmp5 = icmp sgt i32 %2, %b
+  br i1 %cmp5, label %if.then6, label %if.end8
+; New control flow
+; CHECK: br i1 %cmp5, label %if.then6, label %if.end8.for.load
+
+if.then6:
+  store i32 %b, i32* %arrayidx1, align 4
+  br label %if.end8
+; Store moved out
+; CHECK: if.then6:
+; CHECK-NEXT: br label %if.end8
+
+; Load inserted
+; CHECK: if.end8.for.load:
+; CHECK-NEXT: %b.old = load i32, i32* %arrayidx1, align 4
+; CHECK-NEXT: br label %if.end8
+
+if.end8:
+  %3 = load i32, i32* %a, align 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %a, i64 1
+  %4 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %4, %3
+  ret i32 %add11
+; Store moved here
+; CHECK: if.end8:
+; CHECK-NEXT: %b.new = phi i32 [ %b, %if.then6 ], [ %b.old, %if.end8.for.load ]
+; CHECK-NEXT: store i32 %b.new, i32* %arrayidx1, align
+}
+
+; One path not fully "covered"
+define i32 @f6(i64 %k, i64 %j, i32 %b, i32* nocapture %a) #0 {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %k
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %j
+  %cmp = icmp slt i64 %k, %j
+  %add = add nsw i64 %j, %k
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %add
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %0 = load i32, i32* %arrayidx, align 4
+  store i32 %0, i32* %arrayidx1, align 4
+  br label %if.end
+
+if.else:
+  %1 = load i32, i32* %arrayidx2, align 4
+  store i32 %1, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:
+  %2 = load i32, i32* %arrayidx, align 4
+  %cmp5 = icmp sgt i32 %2, %b
+  br i1 %cmp5, label %if.then6, label %if.end8
+
+if.then6:
+  store i32 %b, i32* %arrayidx1, align 4
+  br label %if.end8
+; Optimisation did not fire: along entry -> if.else -> if.then6 there is
+; no preceding store to the same location
+;
CHECK-NEXT: store i32 %b, i32* %arrayidx1, align 4 +; CHECK-NEXT: br label %if.end8 + +if.end8: + %3 = load i32, i32* %a, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %a, i64 1 + %4 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %4, %3 + ret i32 %add11 +} + +; "Invalidating" instruction along a path +define i32 @f7(i64 %k, i64 %j, i32 %b, i32* nocapture %a) #0 { +entry: + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %k + %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %j + %cmp = icmp slt i64 %k, %j + %add = add nsw i64 %j, %k + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %add + br i1 %cmp, label %if.then, label %if.else + +if.then: + %0 = load i32, i32* %arrayidx, align 4 + store i32 %0, i32* %arrayidx1, align 4 + ; The call to `h` invalidates whatever we can infer from the store above. + call void @h() + br label %if.end + +if.else: + %1 = load i32, i32* %arrayidx2, align 4 + store i32 %1, i32* %arrayidx1, align 4 + br label %if.end + +if.end: + %2 = load i32, i32* %arrayidx, align 4 + %cmp5 = icmp sgt i32 %2, %b + br i1 %cmp5, label %if.then6, label %if.end8 + +if.then6: + store i32 %b, i32* %arrayidx1, align 4 + br label %if.end8 +; Optimisation did not fire. +; CHECK: if.then6: +; CHECK-NEXT: store i32 %b, i32* %arrayidx1, align 4 +; CHECK-NEXT: br label %if.end8 + +if.end8: + %3 = load i32, i32* %a, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %a, i64 1 + %4 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %4, %3 + ret i32 %add11 +} + +declare void @h() + +; Invalidating instruction along a cyclic path +@p = global i32* null, align 8 + +define i32 @f8(i64 %k, i64 %j, i32 %b, i32* nocapture %a) { +entry: + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %k + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %j + store i32 %0, i32* %arrayidx1, align 4 + %cmp18 = icmp slt i64 %k, %j + br i1 %cmp18, label %for.body, label %for.end + +for.body: + %k.addr.019 = phi i64 [ %inc, %for.inc ], [ %k, %entry] + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %k.addr.019 + %1 = load i32, i32* %arrayidx2, align 4 + %cmp3 = icmp sgt i32 %1, %b + br i1 %cmp3, label %if.then, label %for.inc + +if.then: + store i32 %b, i32* %arrayidx1, align 4 + br label %for.inc + +; Optimisation is invalid because of the volatile store below. 
+; CHECK: if.then:
+; CHECK-NEXT: store i32 %b, i32* %arrayidx1, align 4
+; CHECK-NEXT: br label %for.inc
+
+for.inc:
+  %ptr = load i32 *, i32** @p, align 8
+  store volatile i32 1, i32* %ptr, align 8
+  %inc = add nsw i64 %k.addr.019, 1
+  %exitcond = icmp ne i64 %inc, %j
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  %2 = load i32, i32* %a, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 1
+  %3 = load i32, i32* %arrayidx6, align 4
+  %add = add nsw i32 %3, %2
+  ret i32 %add
+}
+
+; Non-local stores on all paths (cyclic version)
+define i32 @f9(i64 %k, i64 %j, i32 %b, i32* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %k
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %j
+  store i32 %0, i32* %arrayidx1, align 4
+  %cmp18 = icmp slt i64 %k, %j
+  br i1 %cmp18, label %for.body, label %for.end
+
+for.body:
+  %k.addr.019 = phi i64 [ %inc, %for.inc ], [ %k, %entry]
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %k.addr.019
+  %1 = load i32, i32* %arrayidx2, align 4
+  %cmp3 = icmp sgt i32 %1, %b
+  br i1 %cmp3, label %if.then, label %for.inc
+; New control flow
+; CHECK: br i1 %cmp3, label %if.then, label %for.inc.for.load
+
+if.then:
+  store i32 %b, i32* %arrayidx1, align 4
+  br label %for.inc
+; Store moved out
+; CHECK: if.then:
+; CHECK-NEXT: br label %for.inc
+
+; Load inserted
+; CHECK: for.inc.for.load:
+; CHECK-NEXT: %b.old = load i32, i32* %arrayidx1, align 4
+; CHECK-NEXT: br label %for.inc
+
+for.inc:
+  call void @h();
+  store i32 %b, i32* %arrayidx1, align 4
+  %inc = add nsw i64 %k.addr.019, 1
+  %exitcond = icmp ne i64 %inc, %j
+  br i1 %exitcond, label %for.body, label %for.end
+; Store moved here
+; CHECK: for.inc:
+; CHECK-NEXT: %b.new = phi i32 [ %b, %if.then ], [ %b.old, %for.inc.for.load ]
+; CHECK-NEXT: store i32 %b.new, i32* %arrayidx1, align 4
+
+for.end:
+  %2 = load i32, i32* %a, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 1
+  %3 = load i32, i32* %arrayidx6, align 4
+  %add = add nsw i32 %3, %2
+  ret i32 %add
+}
+
+; "Invalidating" instruction along a path, but it does not invalidate local stores
+define i32 @f10(i64 %k, i64 %j, i32 %b) #0 {
+entry:
+  %a = alloca i64, align 8
+  %tmpcast = bitcast i64* %a to [2 x i32]*
+  store i64 4294967296, i64* %a, align 8
+  %p = bitcast i64* %a to i32*
+  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %k
+  %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 %j
+  %cmp = icmp slt i64 %k, %j
+  %add = add nsw i64 %j, %k
+  %arrayidx2 = getelementptr inbounds i32, i32* %p, i64 %add
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %0 = load i32, i32* %arrayidx, align 4
+  store i32 %0, i32* %arrayidx1, align 4
+  ; A local variable stays writable regardless of calls to arbitrary functions.
+  ; If it is non-escaping, no data races are possible either.
+ call void @h() + br label %if.end + +if.else: + %1 = load i32, i32* %arrayidx2, align 4 + store i32 %1, i32* %arrayidx1, align 4 + br label %if.end + +if.end: + %2 = load i32, i32* %arrayidx, align 4 + %cmp5 = icmp sgt i32 %2, %b + br i1 %cmp5, label %if.then6, label %if.end8 +; CHECK: br i1 %cmp5, label %if.then6, label %if.end8.for.load + +if.then6: + store i32 %b, i32* %arrayidx1, align 4 + br label %if.end8 +; Store moved out +; CHECK: if.then6: +; CHECK-NEXT: br label %if.end8 + +; Load inserted +; CHECK: if.end8.for.load: +; CHECK-NEXT: %b.old = load i32, i32* %arrayidx1, align 4 +; CHECK-NEXT: br label %if.end8 + +if.end8: + %3 = load i32, i32* %p, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %p, i64 1 + %4 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %4, %3 + ret i32 %add11 +; Store moved here +; CHECK: if.end8: +; CHECK-NEXT: %b.new = phi i32 [ %b, %if.then6 ], [ %b.old, %if.end8.for.load ] +; CHECK-NEXT: store i32 %b.new, i32* %arrayidx1, align 4 +} Index: llvm/test/Transforms/InstMerge/st_sink_split_bb.ll =================================================================== --- llvm/test/Transforms/InstMerge/st_sink_split_bb.ll +++ llvm/test/Transforms/InstMerge/st_sink_split_bb.ll @@ -3,14 +3,12 @@ ; have more than 2 predecessors for the block we're going to sink to. ; RUN: opt -basic-aa -memdep -mldst-motion -S < %s | FileCheck %s --check-prefix=CHECK-NO ; RUN: opt -debug-pass-manager -aa-pipeline=basic-aa -passes='require,mldst-motion' -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NO,CHECK-INV-NO -; RUN: opt -debug-pass-manager -aa-pipeline=basic-aa -passes='require,mldst-motion' -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-YES,CHECK-INV-YES +; RUN: opt -debug-pass-manager -aa-pipeline=basic-aa -passes='require,mldst-motion' -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-YES,CHECK-INV-NO target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" ; When passing split-footer-bb to MLSM, it invalidates CFG analyses ; CHECK-INV-NO: Running pass: MergedLoadStoreMotionPass ; CHECK-INV-NO-NOT: Invalidating analysis: DominatorTreeAnalysis -; CHECK-INV-YES: Running pass: MergedLoadStoreMotionPass -; CHECK-INV-YES: Invalidating analysis: DominatorTreeAnalysis ; Function Attrs: nounwind uwtable define dso_local void @st_sink_split_bb(i32* nocapture %arg, i32* nocapture %arg1, i1 zeroext %arg2, i1 zeroext %arg3) local_unnamed_addr {