Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -43,6 +43,7 @@
 #include "llvm/Transforms/ObjCARC.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
 #include "llvm/Transforms/Utils/SymbolRewriter.h"
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 #include "llvm/Transforms/Vectorize.h"
@@ -123,6 +124,7 @@
       (void) llvm::createLowerExpectIntrinsicPass();
       (void) llvm::createLowerInvokePass();
       (void) llvm::createLowerSwitchPass();
+      (void) llvm::createMemorySSAPass();
       (void) llvm::createNaryReassociatePass();
       (void) llvm::createObjCARCAAWrapperPass();
       (void) llvm::createObjCARCAPElimPass();
Index: include/llvm/Transforms/Utils/MemorySSA.h
===================================================================
--- include/llvm/Transforms/Utils/MemorySSA.h
+++ include/llvm/Transforms/Utils/MemorySSA.h
@@ -639,6 +639,13 @@
   std::unique_ptr<MemorySSA> MSSA;
 };
 
+//===--------------------------------------------------------------------===//
+//
+// createMemorySSAPass - This pass builds memory SSA to allow walking memory
+// instructions using a use/def graph.
+//
+FunctionPass *createMemorySSAPass();
+
 /// \brief This is the generic walker interface for walkers of MemorySSA.
 /// Walkers are used to be able to further disambiguate the def-use chains
 /// MemorySSA gives you, or otherwise produce better info than MemorySSA gives
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -36,6 +36,7 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
 #include "llvm/Transforms/Vectorize.h"
 
 using namespace llvm;
@@ -229,6 +230,9 @@
     MPM.add(createSROAPass());
   else
     MPM.add(createScalarReplAggregatesPass(-1, false));
+  if (OptLevel > 2)
+    // Add MemorySSA to enhance memory dependency analysis in EarlyCSE.
+    MPM.add(createMemorySSAPass());
   MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
   // Speculative execution if the target has divergent branches; otherwise nop.
   MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
Index: lib/Transforms/Scalar/EarlyCSE.cpp
===================================================================
--- lib/Transforms/Scalar/EarlyCSE.cpp
+++ lib/Transforms/Scalar/EarlyCSE.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/MemorySSA.h"
 #include <deque>
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -251,6 +252,7 @@
   const TargetTransformInfo &TTI;
   DominatorTree &DT;
   AssumptionCache &AC;
+  MemorySSA *MSSA;
   typedef RecyclingAllocator<
       BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
   typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
@@ -310,8 +312,8 @@
   /// \brief Set up the EarlyCSE runner for a particular function.
   EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
-           DominatorTree &DT, AssumptionCache &AC)
-      : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {}
+           DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA)
+      : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA), CurrentGeneration(0) {}
 
   bool run();
 
@@ -480,9 +482,95 @@
     return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
                                                  ExpectedType);
   }
+
+  bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
+                           Instruction *EarlierInst, Instruction *LaterInst);
+  bool isSameMemGenerationMemSSA(Instruction *EarlierInst,
+                                 Instruction *LaterInst);
+
+  void removeMSSA(Instruction *Inst) {
+    if (!MSSA)
+      return;
+    if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst))
+      MSSA->removeMemoryAccess(MA);
+  }
 };
 }
 
+/// Determine if the memory referenced by LaterInst is from the same heap version
+/// as EarlierInst.
+/// This is currently called in two scenarios:
+///
+///   load p
+///   ...
+///   load p
+///
+/// and
+///
+///   x = load p
+///   ...
+///   store x, p
+///
+/// in both cases we want to verify that there are no possible writes to the
+/// memory referenced by p between the earlier and later instruction.
+bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
+                                   unsigned LaterGeneration,
+                                   Instruction *EarlierInst,
+                                   Instruction *LaterInst) {
+  // Check the simple memory generation tracking first.
+  // FIXME: There are cases that the current implementation of MemorySSA/BasicAA
+  // won't catch but the simple memory generation tracking will. One issue is
+  // that due to the decomposed GEP limit in BasicAA, some non-aliasing memory
+  // operations will show up as clobbers in MemorySSA due to BasicAA giving
+  // conservative results.
+  // FIXME: Another issue is the conservative way MemorySSA treats fence release
+  // instructions. These will appear as MemoryClobbers for unordered stores,
+  // even though there is no ordering relationship between these two operations.
+
+  DEBUG(if (MSSA &&
+            !isSameMemGenerationMemSSA(EarlierInst, LaterInst) &&
+            EarlierGeneration == LaterGeneration)
+          dbgs() << "EarlyCSE: MemorySSA heap generation too conservative: "
+                 << EarlierInst->getFunction()->getName() << "\n"
+                 << " " << *EarlierInst << "\n"
+                 << " " << *LaterInst << "\n";);
+
+  if (EarlierGeneration == LaterGeneration)
+    return true;
+
+  return isSameMemGenerationMemSSA(EarlierInst, LaterInst);
+}
+
+bool EarlyCSE::isSameMemGenerationMemSSA(Instruction *EarlierInst,
+                                         Instruction *LaterInst) {
+  if (!MSSA)
+    return false;
+
+  MemorySSAWalker *MSSAWalker = MSSA->getWalker();
+
+  MemoryAccess *LaterHeapGen = MSSAWalker->getClobberingMemoryAccess(LaterInst);
+
+  if (MSSA->isLiveOnEntryDef(LaterHeapGen))
+    return true;
+
+  MemoryAccess *EarlierMA = MSSA->getMemoryAccess(EarlierInst);
+
+  // Handle cases like this:
+  //   x = load atomic, p
+  //   ... (no aliasing memory ops)
+  //   store unordered x, p
+  // In this case LaterHeapGen (i.e. the clobbering memory access for the store)
+  // is the atomic load (i.e. the EarlierInst MemoryAccess itself).
+  // FIXME: This may be better handled by having MemorySSA be less conservative
+  // when deciding if atomic loads should be clobbers or not.
+  if (LaterHeapGen == EarlierMA)
+    return true;
+
+  MemoryAccess *EarlierHeapGen =
+      MSSAWalker->getClobberingMemoryAccess(EarlierInst);
+  return EarlierHeapGen == LaterHeapGen;
+}
+
 bool EarlyCSE::processNode(DomTreeNode *Node) {
   bool Changed = false;
   BasicBlock *BB = Node->getBlock();
@@ -540,6 +628,7 @@
     // Dead instructions should just be removed.
     if (isInstructionTriviallyDead(Inst, &TLI)) {
       DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+      removeMSSA(Inst);
       Inst->eraseFromParent();
       Changed = true;
       ++NumSimplify;
@@ -576,6 +665,11 @@
     if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) {
       DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
       Inst->replaceAllUsesWith(V);
+      // This relies on SimplifyInstruction not removing any instructions that
+      // have MemoryAccesses. It may make them dead, in which case they will
+      // get removed in the code above and MemorySSA updated correctly.
+      // FIXME: Perhaps MemorySSA should use ValueHandles?
+      removeMSSA(Inst);
       Inst->eraseFromParent();
       Changed = true;
       ++NumSimplify;
@@ -590,6 +684,7 @@
       if (auto *I = dyn_cast<Instruction>(V))
         I->andIRFlags(Inst);
       Inst->replaceAllUsesWith(V);
+      removeMSSA(Inst);
       Inst->eraseFromParent();
       Changed = true;
       ++NumCSE;
@@ -614,18 +709,21 @@
       // If we have an available version of this load, and if it is the right
       // generation, replace this instruction.
       LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
-      if (InVal.DefInst != nullptr && InVal.Generation == CurrentGeneration &&
+      if (InVal.DefInst != nullptr &&
           InVal.MatchingId == MemInst.getMatchingId() &&
           // We don't yet handle removing loads with ordering of any kind.
           !MemInst.isVolatile() && MemInst.isUnordered() &&
           // We can't replace an atomic load with one which isn't also atomic.
-          InVal.IsAtomic >= MemInst.isAtomic()) {
+          InVal.IsAtomic >= MemInst.isAtomic() &&
+          isSameMemGeneration(InVal.Generation, CurrentGeneration,
+                              InVal.DefInst, Inst)) {
         Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
         if (Op != nullptr) {
           DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
                        << " to: " << *InVal.DefInst << '\n');
           if (!Inst->use_empty())
             Inst->replaceAllUsesWith(Op);
+          removeMSSA(Inst);
           Inst->eraseFromParent();
           Changed = true;
           ++NumCSELoad;
@@ -656,11 +754,14 @@
       // If we have an available version of this call, and if it is the right
       // generation, replace this instruction.
       std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Inst);
-      if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
+      if (InVal.first != nullptr &&
+          isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
+                              Inst)) {
         DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
                      << " to: " << *InVal.first << '\n');
         if (!Inst->use_empty())
           Inst->replaceAllUsesWith(InVal.first);
+        removeMSSA(Inst);
         Inst->eraseFromParent();
         Changed = true;
         ++NumCSECall;
@@ -693,15 +794,18 @@
         LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
         if (InVal.DefInst &&
             InVal.DefInst == getOrCreateResult(Inst, InVal.DefInst->getType()) &&
-            InVal.Generation == CurrentGeneration &&
             InVal.MatchingId == MemInst.getMatchingId() &&
             // We don't yet handle removing stores with ordering of any kind.
-            !MemInst.isVolatile() && MemInst.isUnordered()) {
+            !MemInst.isVolatile() && MemInst.isUnordered() &&
+            isSameMemGeneration(InVal.Generation, CurrentGeneration,
+                                InVal.DefInst, Inst)) {
           assert((!LastStore ||
                   ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
-                  MemInst.getPointerOperand()) &&
-                 "can't have an intervening store!");
+                  MemInst.getPointerOperand() ||
+                  MSSA) &&
+                 "can't have an intervening store if not using MemorySSA!");
           DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+          removeMSSA(Inst);
           Inst->eraseFromParent();
           Changed = true;
           ++NumDSE;
@@ -733,6 +837,7 @@
           if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
             DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
                          << " due to: " << *Inst << '\n');
+            removeMSSA(LastStore);
             LastStore->eraseFromParent();
             Changed = true;
             ++NumDSE;
@@ -829,8 +934,9 @@
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto *MSSA = AM.getCachedResult<MemorySSAAnalysis>(F);
 
-  EarlyCSE CSE(TLI, TTI, DT, AC);
+  EarlyCSE CSE(TLI, TTI, DT, AC, MSSA);
 
   if (!CSE.run())
     return PreservedAnalyses::all();
@@ -840,6 +946,7 @@
   PreservedAnalyses PA;
   PA.preserve<DominatorTreeAnalysis>();
   PA.preserve<GlobalsAA>();
+  PA.preserve<MemorySSAAnalysis>();
   return PA;
 }
@@ -867,8 +974,10 @@
   auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  auto *MSSAPass = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+  auto *MSSA = MSSAPass ? &MSSAPass->getMSSA() : nullptr;
 
-  EarlyCSE CSE(TLI, TTI, DT, AC);
+  EarlyCSE CSE(TLI, TTI, DT, AC, MSSA);
 
   return CSE.run();
 }
@@ -878,7 +987,9 @@
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addUsedIfAvailable<MemorySSAWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<MemorySSAWrapperPass>();
     AU.setPreservesCFG();
   }
 };
Index: lib/Transforms/Utils/MemorySSA.cpp
===================================================================
--- lib/Transforms/Utils/MemorySSA.cpp
+++ lib/Transforms/Utils/MemorySSA.cpp
@@ -755,6 +755,8 @@
   MSSA->print(OS);
 }
 
+FunctionPass *createMemorySSAPass() { return new MemorySSAWrapperPass(); }
+
 MemorySSAWalker::MemorySSAWalker(MemorySSA *M) : MSSA(M) {}
 
 CachingMemorySSAWalker::CachingMemorySSAWalker(MemorySSA *M, AliasAnalysis *A,
Index: test/Transforms/EarlyCSE/AArch64/intrinsics.ll
===================================================================
--- test/Transforms/EarlyCSE/AArch64/intrinsics.ll
+++ test/Transforms/EarlyCSE/AArch64/intrinsics.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
+; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -basicaa -memoryssa -early-cse | FileCheck %s
 ; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s
 
 define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
Index: test/Transforms/EarlyCSE/AArch64/ldstN.ll
===================================================================
--- test/Transforms/EarlyCSE/AArch64/ldstN.ll
+++ test/Transforms/EarlyCSE/AArch64/ldstN.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -early-cse < %s | FileCheck %s
+; RUN: opt -S -basicaa -memoryssa -early-cse < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
 
Index: test/Transforms/EarlyCSE/atomics.ll
===================================================================
--- test/Transforms/EarlyCSE/atomics.ll
+++ test/Transforms/EarlyCSE/atomics.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -S -early-cse | FileCheck %s
+; RUN: opt < %s -S -basicaa -memoryssa -early-cse | FileCheck %s
 
 ; CHECK-LABEL: @test12(
 define i32 @test12(i1 %B, i32* %P1, i32* %P2) {
Index: test/Transforms/EarlyCSE/basic.ll
===================================================================
--- test/Transforms/EarlyCSE/basic.ll
+++ test/Transforms/EarlyCSE/basic.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -S -early-cse | FileCheck %s
+; RUN: opt < %s -S -basicaa -memoryssa -early-cse | FileCheck %s
 ; RUN: opt < %s -S -passes=early-cse | FileCheck %s
 
 declare void @llvm.assume(i1) nounwind
Index: test/Transforms/EarlyCSE/commute.ll
===================================================================
--- test/Transforms/EarlyCSE/commute.ll
+++ test/Transforms/EarlyCSE/commute.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -S -early-cse | FileCheck %s
+; RUN: opt < %s -S -basicaa -memoryssa -early-cse | FileCheck %s
 
 ; CHECK-LABEL: @test1(
 define void @test1(float %A, float %B, float* %PA, float* %PB) {
Index: test/Transforms/EarlyCSE/fence.ll
===================================================================
--- test/Transforms/EarlyCSE/fence.ll
+++ test/Transforms/EarlyCSE/fence.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -early-cse < %s | FileCheck %s
+; RUN: opt < %s -S -basicaa -memoryssa -early-cse | FileCheck %s
 ; NOTE: This file is testing the current implementation. Some of
 ; the transforms used as negative tests below would be legal, but
 ; only if reached through a chain of logic which EarlyCSE is incapable
Index: test/Transforms/EarlyCSE/guards.ll
===================================================================
--- test/Transforms/EarlyCSE/guards.ll
+++ test/Transforms/EarlyCSE/guards.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -early-cse < %s | FileCheck %s
+; RUN: opt < %s -S -basicaa -memoryssa -early-cse | FileCheck %s
 
 declare void @llvm.experimental.guard(i1,...)
 
Index: test/Transforms/EarlyCSE/memoryssa.ll
===================================================================
--- /dev/null
+++ test/Transforms/EarlyCSE/memoryssa.ll
@@ -0,0 +1,33 @@
+; RUN: opt < %s -S -early-cse | FileCheck %s --check-prefix=CHECK-NOMEMSSA
+; RUN: opt < %s -S -basicaa -memoryssa -early-cse | FileCheck %s
+; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='require<memoryssa>,early-cse' | FileCheck %s
+
+@G1 = global i32 zeroinitializer
+@G2 = global i32 zeroinitializer
+
+;; Simple load value numbering across non-clobbering store.
+; CHECK-LABEL: @test1(
+; CHECK-NOMEMSSA-LABEL: @test1(
+define i32 @test1() {
+  %V1 = load i32, i32* @G1
+  store i32 0, i32* @G2
+  %V2 = load i32, i32* @G1
+  ; CHECK-NOMEMSSA: sub i32 %V1, %V2
+  %Diff = sub i32 %V1, %V2
+  ret i32 %Diff
+  ; CHECK: ret i32 0
+}
+
+;; Simple dead store elimination across non-clobbering store.
+; CHECK-LABEL: @test2(
+; CHECK-NOMEMSSA-LABEL: @test2(
+define void @test2() {
+entry:
+  %V1 = load i32, i32* @G1
+  ; CHECK: store i32 0, i32* @G2
+  store i32 0, i32* @G2
+  ; CHECK-NOT: store
+  ; CHECK-NOMEMSSA: store i32 %V1, i32* @G1
+  store i32 %V1, i32* @G1
+  ret void
+}