Index: lib/Transforms/Scalar/EarlyCSE.cpp =================================================================== --- lib/Transforms/Scalar/EarlyCSE.cpp +++ lib/Transforms/Scalar/EarlyCSE.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -32,12 +33,17 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/MemorySSA.h" #include <deque> using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "early-cse" +static cl::opt<bool> EarlyCSEUseMemorySSA( + "early-cse-use-memoryssa", cl::init(false), cl::Hidden, + cl::desc("Use MemorySSA to check memory dependencies in EarlyCSE.")); + STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd"); STATISTIC(NumCSE, "Number of instructions CSE'd"); STATISTIC(NumCSECVP, "Number of compare instructions CVP'd"); @@ -251,6 +257,8 @@ const TargetTransformInfo &TTI; DominatorTree &DT; AssumptionCache &AC; + AliasAnalysis &AA; + MemorySSA &MSSA; typedef RecyclingAllocator< BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy; typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>, AllocatorTy> ScopedHTType; @@ -310,8 +318,10 @@ /// \brief Set up the EarlyCSE runner for a particular function.
EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, - DominatorTree &DT, AssumptionCache &AC) - : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} + DominatorTree &DT, AssumptionCache &AC, AliasAnalysis &AA, + MemorySSA &MSSA) - : TLI(TLI), TTI(TTI), DT(DT), AC(AC), AA(AA), MSSA(MSSA), + CurrentGeneration(0) {} bool run(); @@ -480,9 +490,99 @@ return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst), ExpectedType); } + + bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration, + Instruction *EarlierInst, Instruction *LaterInst); + bool isSameMemGenerationMemSSA(Instruction *EarlierInst, + Instruction *LaterInst); + + void removeMSSA(Instruction *Inst) { + if (!MSSA.isFinishedBuilding()) + return; + if (MemoryAccess *MA = MSSA.getMemoryAccess(Inst)) + MSSA.removeMemoryAccess(MA); + } }; } +// Determine if the memory referenced by LaterInst is from the same heap version +// as EarlierInst. +// This is currently called in two scenarios: +// +// load p +// ... +// load p +// +// and +// +// x = load p +// ... +// store x, p +// +// in both cases we want to verify that there are no possible writes to the +// memory referenced by p between the earlier and later instruction. +bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration, + unsigned LaterGeneration, + Instruction *EarlierInst, + Instruction *LaterInst) { + // Check the simple memory generation tracking first. + // FIXME: There are cases that the current implementation of MemorySSA/BasicAA + // won't catch but the simple memory generation tracking will. One issue is + // that due to the decomposed GEP limit in BasicAA, some non-aliasing memory + // operations will show up as clobbers in MemorySSA due to BasicAA giving + // conservative results. + // FIXME: Another issue is the conservative way MemorySSA treats fence release
These will appear as MemoryClobbers for unordered stores, + // even though there is no ordering relationship between these two operations. + + DEBUG(if (EarlyCSEUseMemorySSA && + !isSameMemGenerationMemSSA(EarlierInst, LaterInst) && + EarlierGeneration == LaterGeneration) + dbgs() << "EarlyCSE: MemorySSA heap generation too conservative: " + << EarlierInst->getFunction()->getName() << "\n" + << " " << *EarlierInst << "\n" + << " " << *LaterInst << "\n";); + + if (EarlierGeneration == LaterGeneration) + return true; + + return isSameMemGenerationMemSSA(EarlierInst, LaterInst); +} + +bool EarlyCSE::isSameMemGenerationMemSSA(Instruction *EarlierInst, + Instruction *LaterInst) { + if (!EarlyCSEUseMemorySSA) + return false; + + // Build MemorySSA if it hasn't already been built. + MemorySSAWalker *MSSAWalker = MSSA.buildMemorySSA(&AA, &DT); + + MemoryAccess *LaterHeapGen = MSSAWalker->getClobberingMemoryAccess(LaterInst); + + if (MSSA.isLiveOnEntryDef(LaterHeapGen)) + return true; + + MemoryAccess *EarlierMA = MSSA.getMemoryAccess(EarlierInst); + + // Handle cases like this: + // x = load atomic, p + // ... (no aliasing memory ops) + // store unordered x, p + // In this case LaterHeapGen (i.e. the clobbering memory access for the store) + // is the atomic load (i.e. the EarlierInst MemoryAccess itself). + // FIXME: This may be better handled by having MemorySSA be less conservative + // when deciding if atomic loads should be clobbers or not. + if (LaterHeapGen == EarlierMA) + return true; + + MemoryAccess *EarlierHeapGen = + MSSAWalker->getClobberingMemoryAccess(EarlierInst); + if (EarlierHeapGen == LaterHeapGen) + return true; + + return false; +} + bool EarlyCSE::processNode(DomTreeNode *Node) { bool Changed = false; BasicBlock *BB = Node->getBlock(); @@ -540,6 +640,7 @@ // Dead instructions should just be removed. 
if (isInstructionTriviallyDead(Inst, &TLI)) { DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); + removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; ++NumSimplify; @@ -576,6 +677,11 @@ if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); + // This relies on SimplifyInstruction not removing any instructions that + // have MemoryAccesses. It may make them dead, in which case they will + // get removed in the code above and MemorySSA updated correctly. + // FIXME: Perhaps MemorySSA should use ValueHandles? + removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; ++NumSimplify; @@ -590,6 +696,7 @@ if (auto *I = dyn_cast(V)) I->andIRFlags(Inst); Inst->replaceAllUsesWith(V); + removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; ++NumCSE; @@ -614,18 +721,21 @@ // If we have an available version of this load, and if it is the right // generation, replace this instruction. LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); - if (InVal.DefInst != nullptr && InVal.Generation == CurrentGeneration && + if (InVal.DefInst != nullptr && InVal.MatchingId == MemInst.getMatchingId() && // We don't yet handle removing loads with ordering of any kind. !MemInst.isVolatile() && MemInst.isUnordered() && // We can't replace an atomic load with one which isn't also atomic. 
- InVal.IsAtomic >= MemInst.isAtomic()) { + InVal.IsAtomic >= MemInst.isAtomic() && + isSameMemGeneration(InVal.Generation, CurrentGeneration, + InVal.DefInst, Inst)) { Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType()); if (Op != nullptr) { DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: " << *InVal.DefInst << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(Op); + removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; ++NumCSELoad; @@ -656,11 +766,14 @@ // If we have an available version of this call, and if it is the right // generation, replace this instruction. std::pair<Value *, unsigned> InVal = AvailableCalls.lookup(Inst); - if (InVal.first != nullptr && InVal.second == CurrentGeneration) { + if (InVal.first != nullptr && + isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first, + Inst)) { DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " << *InVal.first << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); + removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; ++NumCSECall; @@ -693,15 +806,17 @@ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); if (InVal.DefInst && InVal.DefInst == getOrCreateResult(Inst, InVal.DefInst->getType()) && - InVal.Generation == CurrentGeneration && InVal.MatchingId == MemInst.getMatchingId() && // We don't yet handle removing stores with ordering of any kind.
- !MemInst.isVolatile() && MemInst.isUnordered()) { + !MemInst.isVolatile() && MemInst.isUnordered() && + isSameMemGeneration(InVal.Generation, CurrentGeneration, + InVal.DefInst, Inst)) { assert((!LastStore || ParseMemoryInst(LastStore, TTI).getPointerOperand() == MemInst.getPointerOperand()) && "can't have an intervening store!"); DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n'); + removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; ++NumDSE; @@ -733,6 +848,7 @@ if (LastStoreMemInst.isMatchingMemLoc(MemInst)) { DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: " << *Inst << '\n'); + removeMSSA(LastStore); LastStore->eraseFromParent(); Changed = true; ++NumDSE; @@ -829,8 +945,10 @@ auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); + auto &AA = AM.getResult<AAManager>(F); + auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA(); - EarlyCSE CSE(TLI, TTI, DT, AC); + EarlyCSE CSE(TLI, TTI, DT, AC, AA, MSSA); if (!CSE.run()) return PreservedAnalyses::all(); @@ -839,6 +957,7 @@ // FIXME: Bundle this with other CFG-preservation.
PreservedAnalyses PA; PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<MemorySSAAnalysis>(); return PA; } @@ -866,8 +985,10 @@ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto &MSSA = getAnalysis<MemorySSALazy>().getMSSA(); - EarlyCSE CSE(TLI, TTI, DT, AC); + EarlyCSE CSE(TLI, TTI, DT, AC, AA, MSSA); return CSE.run(); } @@ -877,7 +998,10 @@ AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addRequired<MemorySSALazy>(); + AU.addPreserved<MemorySSALazy>(); AU.setPreservesCFG(); } }; @@ -893,4 +1017,6 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSALazy) INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false) Index: test/Transforms/EarlyCSE/basic.ll =================================================================== --- test/Transforms/EarlyCSE/basic.ll +++ test/Transforms/EarlyCSE/basic.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -S -early-cse | FileCheck %s -; RUN: opt < %s -S -passes=early-cse | FileCheck %s +; RUN: opt < %s -S -aa-pipeline=basic-aa -passes=early-cse | FileCheck %s declare void @llvm.assume(i1) nounwind Index: test/Transforms/EarlyCSE/memoryssa.ll =================================================================== --- /dev/null +++ test/Transforms/EarlyCSE/memoryssa.ll @@ -0,0 +1,15 @@ +; RUN: opt < %s -S -aa-pipeline=basic-aa -passes=early-cse -early-cse-use-memoryssa | FileCheck %s + +@G1 = global i32 zeroinitializer +@G2 = global i32 zeroinitializer + +;; Simple load value numbering across non-clobbering store. +; CHECK-LABEL: @test1( +define i32 @test1() { + %V1 = load i32, i32* @G1 + store i32 0, i32* @G2 + %V2 = load i32, i32* @G1 + %Diff = sub i32 %V1, %V2 + ret i32 %Diff + ; CHECK: ret i32 0 +}