Index: lib/Transforms/Scalar/EarlyCSE.cpp
===================================================================
--- lib/Transforms/Scalar/EarlyCSE.cpp
+++ lib/Transforms/Scalar/EarlyCSE.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/ScopedHashTable.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -32,12 +33,17 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
 #include <deque>
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "early-cse"
 
+static cl::opt<bool> EarlyCSEUseMemorySSA(
+    "early-cse-use-memoryssa", cl::init(false), cl::Hidden,
+    cl::desc("Use MemorySSA to check memory dependencies in EarlyCSE."));
+
 STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
 STATISTIC(NumCSE,      "Number of instructions CSE'd");
 STATISTIC(NumCSECVP,   "Number of compare instructions CVP'd");
@@ -251,6 +257,8 @@
   const TargetTransformInfo &TTI;
   DominatorTree &DT;
   AssumptionCache &AC;
+  AliasAnalysis &AA;
+  MemorySSA &MSSA;
   typedef RecyclingAllocator<
       BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
   typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
@@ -301,7 +309,8 @@
   /// values.
   ///
   /// It uses the same generation count as loads.
-  typedef ScopedHashTable<CallValue, std::pair<Value *, unsigned>> CallHTType;
+  typedef ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>
+      CallHTType;
   CallHTType AvailableCalls;
 
   /// \brief This is the current generation of the memory value.
@@ -309,8 +318,10 @@
 
   /// \brief Set up the EarlyCSE runner for a particular function.
   EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
-           DominatorTree &DT, AssumptionCache &AC)
-      : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {}
+           DominatorTree &DT, AssumptionCache &AC, AliasAnalysis &AA,
+           MemorySSA &MSSA)
+      : TLI(TLI), TTI(TTI), DT(DT), AC(AC), AA(AA), MSSA(MSSA),
+        CurrentGeneration(0) {}
 
   bool run();
 
@@ -479,9 +490,98 @@
     return TTI.getOrCreateResultFromMemIntrinsic(cast<MemIntrinsic>(Inst),
                                                  ExpectedType);
   }
+
+  bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
+                           Instruction *EarlierInst, Instruction *LaterInst,
+                           bool CanSkipReleaseFence);
+
+  void removeMSSA(Instruction *Inst) {
+    if (!MSSA.isFinishedBuilding())
+      return;
+    if (MemoryAccess *MA = MSSA.getMemoryAccess(Inst))
+      MSSA.removeMemoryAccess(MA);
+  }
 };
 }
 
+static MemoryAccess *getHeapGeneration(MemoryAccess *MA,
+                                       MemoryLocation &MemLoc,
+                                       bool CanSkipReleaseFence,
+                                       MemorySSA &MSSA,
+                                       MemorySSAWalker *MSSAWalker) {
+  if (!CanSkipReleaseFence)
+    return MA;
+
+  if (MSSA.isLiveOnEntryDef(MA))
+    return MA;
+
+  auto *MD = dyn_cast<MemoryDef>(MA);
+  if (!MD)
+    return MA;
+
+  Instruction *MDInst = MD->getMemoryInst();
+  FenceInst *Fence = dyn_cast<FenceInst>(MDInst);
+  if (!Fence)
+    return MA;
+
+  if (Fence->getOrdering() != AtomicOrdering::Release)
+    return MA;
+
+  return MSSAWalker->getClobberingMemoryAccess(MD->getDefiningAccess(),
+                                               MemLoc);
+}
+
+// Skip over release fences: the caller has already weeded out instructions
+// that cannot migrate across a release fence.
+static MemoryAccess *getHeapGeneration(Instruction *Inst,
+                                       bool CanSkipReleaseFence,
+                                       MemorySSA &MSSA,
+                                       MemorySSAWalker *MSSAWalker) {
+  MemoryAccess *ClobberMA = MSSAWalker->getClobberingMemoryAccess(Inst);
+  // Calls don't have a distinct memory location.
+  // FIXME: This is too conservative for some load/store intrinsics or library
+  // calls.
+  if (ImmutableCallSite(Inst))
+    return ClobberMA;
+  MemoryLocation InstMemLoc = MemoryLocation::get(Inst);
+  return getHeapGeneration(ClobberMA, InstMemLoc, CanSkipReleaseFence, MSSA,
+                           MSSAWalker);
+}
+
+// Determine whether the memory referenced by LaterInst is from the same heap
+// version as the memory referenced by EarlierInst.
+bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
+                                   unsigned LaterGeneration,
+                                   Instruction *EarlierInst,
+                                   Instruction *LaterInst,
+                                   bool CanSkipReleaseFence) {
+  if (!EarlyCSEUseMemorySSA)
+    return EarlierGeneration == LaterGeneration;
+
+  // Build MemorySSA if it hasn't already been built.
+  MemorySSAWalker *MSSAWalker = MSSA.buildMemorySSA(&AA, &DT);
+
+  MemoryAccess *LaterHeapGen =
+      getHeapGeneration(LaterInst, CanSkipReleaseFence, MSSA, MSSAWalker);
+
+  if (MSSA.isLiveOnEntryDef(LaterHeapGen))
+    return true;
+
+  MemoryAccess *EarlierMA = MSSA.getMemoryAccess(EarlierInst);
+
+  // Handle the case where the earlier instruction is itself the later one's
+  // clobbering access, e.g. load atomic -> store unordered.
+  if (LaterHeapGen == EarlierMA)
+    return true;
+
+  MemoryAccess *EarlierHeapGen =
+      getHeapGeneration(EarlierInst, CanSkipReleaseFence, MSSA, MSSAWalker);
+  if (EarlierHeapGen == LaterHeapGen)
+    return true;
+
+  // Make sure we haven't missed any cases that were caught using the old
+  // method.
+  assert(EarlierGeneration != LaterGeneration);
+  return false;
+}
+
 bool EarlyCSE::processNode(DomTreeNode *Node) {
   bool Changed = false;
   BasicBlock *BB = Node->getBlock();
@@ -539,6 +639,7 @@
     // Dead instructions should just be removed.
     if (isInstructionTriviallyDead(Inst, &TLI)) {
       DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+      removeMSSA(Inst);
       Inst->eraseFromParent();
       Changed = true;
       ++NumSimplify;
@@ -575,6 +676,10 @@
     if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) {
       DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << "  to: " << *V << '\n');
       Inst->replaceAllUsesWith(V);
+      // This relies on SimplifyInstruction not removing any instructions that
+      // have MemoryAccesses. It may make them dead, in which case they will
+      // get removed in the code above and MemorySSA updated correctly.
+      removeMSSA(Inst);
       Inst->eraseFromParent();
       Changed = true;
       ++NumSimplify;
@@ -589,6 +694,7 @@
       if (auto *I = dyn_cast<Instruction>(V))
         I->andIRFlags(Inst);
       Inst->replaceAllUsesWith(V);
+      removeMSSA(Inst);
       Inst->eraseFromParent();
       Changed = true;
       ++NumCSE;
@@ -613,18 +719,21 @@
       // If we have an available version of this load, and if it is the right
       // generation, replace this instruction.
       LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
-      if (InVal.Inst != nullptr && InVal.Generation == CurrentGeneration &&
+      if (InVal.Inst != nullptr && InVal.MatchingId == MemInst.getMatchingId() &&
           // We don't yet handle removing loads with ordering of any kind.
          !MemInst.isVolatile() && MemInst.isUnordered() &&
          // We can't replace an atomic load with one which isn't also atomic.
-          InVal.IsAtomic >= MemInst.isAtomic()) {
+          InVal.IsAtomic >= MemInst.isAtomic() &&
+          isSameMemGeneration(InVal.Generation, CurrentGeneration, InVal.Inst,
+                              Inst, /*CanSkipReleaseFence=*/true)) {
         Value *Op = getOrCreateResult(InVal.Inst, Inst->getType());
         if (Op != nullptr) {
           DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
                        << "  to: " << *InVal.Inst << '\n');
           if (!Inst->use_empty())
             Inst->replaceAllUsesWith(Op);
+          removeMSSA(Inst);
           Inst->eraseFromParent();
           Changed = true;
           ++NumCSELoad;
@@ -654,12 +763,16 @@
     if (CallValue::canHandle(Inst)) {
       // If we have an available version of this call, and if it is the right
       // generation, replace this instruction.
-      std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Inst);
-      if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
+      std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Inst);
+      if (InVal.first != nullptr &&
+          isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
+                              Inst,
+                              /*CanSkipReleaseFence=*/true)) {
         DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
                      << "  to: " << *InVal.first << '\n');
         if (!Inst->use_empty())
           Inst->replaceAllUsesWith(InVal.first);
+        removeMSSA(Inst);
         Inst->eraseFromParent();
         Changed = true;
         ++NumCSECall;
@@ -668,7 +781,7 @@
 
       // Otherwise, remember that we have this instruction.
       AvailableCalls.insert(
-          Inst, std::pair<Value *, unsigned>(Inst, CurrentGeneration));
+          Inst, std::pair<Instruction *, unsigned>(Inst, CurrentGeneration));
 
       continue;
     }
@@ -692,15 +805,18 @@
         LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
         if (InVal.Inst &&
             InVal.Inst == getOrCreateResult(Inst, InVal.Inst->getType()) &&
-            InVal.Generation == CurrentGeneration &&
             InVal.MatchingId == MemInst.getMatchingId() &&
             // We don't yet handle removing stores with ordering of any kind.
-            !MemInst.isVolatile() && MemInst.isUnordered()) {
+            !MemInst.isVolatile() && MemInst.isUnordered() &&
+            isSameMemGeneration(InVal.Generation, CurrentGeneration,
+                                InVal.Inst, Inst,
+                                /*CanSkipReleaseFence=*/true)) {
           assert((!LastStore ||
                   ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
                       MemInst.getPointerOperand()) &&
                  "can't have an intervening store!");
           DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+          removeMSSA(Inst);
           Inst->eraseFromParent();
           Changed = true;
           ++NumDSE;
@@ -732,6 +848,7 @@
         if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
           DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
                        << "  due to: " << *Inst << '\n');
+          removeMSSA(LastStore);
           LastStore->eraseFromParent();
           Changed = true;
           ++NumDSE;
@@ -828,8 +945,10 @@
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto &AA = AM.getResult<AAManager>(F);
+  auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
 
-  EarlyCSE CSE(TLI, TTI, DT, AC);
+  EarlyCSE CSE(TLI, TTI, DT, AC, AA, MSSA);
 
   if (!CSE.run())
     return PreservedAnalyses::all();
@@ -838,6 +957,7 @@
   // CSE preserves the dominator tree because it doesn't mutate the CFG.
   // FIXME: Bundle this with other CFG-preservation.
   PreservedAnalyses PA;
   PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<MemorySSAAnalysis>();
   return PA;
 }
 
@@ -865,8 +985,10 @@
     auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+    auto &MSSA = getAnalysis<MemorySSALazy>().getMSSA();
 
-    EarlyCSE CSE(TLI, TTI, DT, AC);
+    EarlyCSE CSE(TLI, TTI, DT, AC, AA, MSSA);
 
     return CSE.run();
   }
@@ -876,7 +998,10 @@
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addRequired<MemorySSALazy>();
+    AU.addPreserved<MemorySSALazy>();
     AU.setPreservesCFG();
   }
 };
@@ -892,4 +1017,6 @@
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSALazy)
 INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
Index: test/Transforms/EarlyCSE/memoryssa.ll
===================================================================
--- /dev/null
+++ test/Transforms/EarlyCSE/memoryssa.ll
@@ -0,0 +1,15 @@
+; RUN: opt < %s -S -aa-pipeline=basic-aa -passes=early-cse -early-cse-use-memoryssa | FileCheck %s
+
+@G1 = global i32 zeroinitializer
+@G2 = global i32 zeroinitializer
+
+;; Simple load value numbering across non-clobbering store.
+; CHECK-LABEL: @test1(
+define i32 @test1() {
+  %V1 = load i32, i32* @G1
+  store i32 0, i32* @G2
+  %V2 = load i32, i32* @G1
+  %Diff = sub i32 %V1, %V2
+  ret i32 %Diff
+  ; CHECK: ret i32 0
+}
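
Note (not part of the patch): the /*CanSkipReleaseFence=*/true path is the main
behavioral difference from the flat generation counter, which is bumped at
every fence. A sketch of further coverage for memoryssa.ll along these lines
could pin that behavior down; the function names, the %P argument, and the
CHECK lines below are illustrative assumptions, not tests from this change:

;; Unordered loads may be CSE'd across a release fence: hoisting %V2 up to %V1
;; is legal because a release fence does not order earlier loads against later
;; ones, so the MemorySSA walker is allowed to step over the fence's MemoryDef
;; and find liveOnEntry as the clobber.
; CHECK-LABEL: @test_fence(
define i32 @test_fence() {
  %V1 = load i32, i32* @G1
  fence release
  %V2 = load i32, i32* @G1
  %Diff = sub i32 %V1, %V2
  ret i32 %Diff
  ; CHECK: ret i32 0
}

;; Negative case: a store through a may-aliasing pointer is a genuine clobber
;; under MemorySSA as well, so the second load must survive.
; CHECK-LABEL: @test_clobber(
define i32 @test_clobber(i32* %P) {
  %V1 = load i32, i32* @G1
  store i32 42, i32* %P
  %V2 = load i32, i32* @G1
  ; CHECK: %V2 = load i32, i32* @G1
  %Diff = sub i32 %V1, %V2
  ret i32 %Diff
}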