Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -93,6 +93,7 @@
       (void) llvm::createDomOnlyViewerPass();
       (void) llvm::createDomViewerPass();
       (void) llvm::createGCOVProfilerPass();
+      (void) llvm::createPDSEPass();
       (void) llvm::createPGOInstrumentationGenLegacyPass();
       (void) llvm::createPGOInstrumentationUseLegacyPass();
       (void) llvm::createPGOIndirectCallPromotionLegacyPass();
Index: include/llvm/Transforms/IPO/PassManagerBuilder.h
===================================================================
--- include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -147,6 +147,7 @@
   bool LoadCombine;
   bool NewGVN;
   bool DisableGVNLoadPRE;
+  bool PDSE;
   bool VerifyInput;
   bool VerifyOutput;
   bool MergeFunctions;
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -75,6 +75,12 @@
 //===----------------------------------------------------------------------===//
 //
+// PDSE - This pass deletes both partially and fully redundant stores.
+//
+FunctionPass *createPDSEPass();
+
+//===----------------------------------------------------------------------===//
+//
 // AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm. This
 // algorithm assumes instructions are dead until proven otherwise, which makes
 // it more successful are removing non-obviously dead instructions.
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -74,6 +74,9 @@
 static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden,
                                cl::desc("Run the NewGVN pass"));
 
+static cl::opt<bool> RunPDSE("enable-pdse", cl::init(false), cl::Hidden,
+                             cl::desc("Run with PDSE instead of DSE."));
+
 static cl::opt<bool> RunSLPAfterLoopVectorization(
     "run-slp-after-loop-vectorization", cl::init(true), cl::Hidden,
@@ -158,6 +161,7 @@
     RerollLoops = RunLoopRerolling;
     LoadCombine = RunLoadCombine;
     NewGVN = RunNewGVN;
+    PDSE = RunPDSE;
     DisableGVNLoadPRE = false;
     VerifyInput = false;
     VerifyOutput = false;
@@ -342,7 +346,8 @@
   addExtensionsToPM(EP_Peephole, MPM);
   MPM.add(createJumpThreadingPass());         // Thread jumps
   MPM.add(createCorrelatedValuePropagationPass());
-  MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+  MPM.add(PDSE ? createPDSEPass()
+               : createDeadStoreEliminationPass()); // Delete dead stores
   MPM.add(createLICMPass());
 
   addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
@@ -761,8 +766,8 @@
                 : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
   PM.add(createMemCpyOptPass());            // Remove dead memcpys.
-  // Nuke dead stores.
-  PM.add(createDeadStoreEliminationPass());
+  PM.add(PDSE ? createPDSEPass()            // Really nuke dead stores.
+              : createDeadStoreEliminationPass()); // Nuke dead stores.
 
   // More loops are countable; try to optimize them.
   PM.add(createIndVarSimplifyPass());
Index: lib/Transforms/Scalar/PDSE.cpp
===================================================================
--- lib/Transforms/Scalar/PDSE.cpp
+++ lib/Transforms/Scalar/PDSE.cpp
@@ -37,41 +37,781 @@
 // Partial Redundancy Elimination in SSA Form
 // https://doi.org/10.1145/319301.319348
 //
-// Differences between the papers and this implementation:
-// - May-throw instructions count as killing occurrences in the factored
-//   redundancy graph of escaping stores;
-// - TODO: Figure out partial overwrite tracking.
+// - TODO: Handle partial overwrite tracking during the full redundancy
+//   elimination phase.
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/PDSE.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <list>
 
 #define DEBUG_TYPE "pdse"
 
 using namespace llvm;
 
+STATISTIC(NumStores, "Number of stores deleted");
+STATISTIC(NumPartialReds, "Number of partial redundancies converted.");
+
 static cl::opt<bool>
     PrintFRG("print-frg", cl::init(false), cl::Hidden,
              cl::desc("Print the factored redundancy graph of stores."));
 
 namespace {
-bool runPDSE(Function &F, AliasAnalysis &AA, const PostDominatorTree &PDT,
-             const TargetLibraryInfo &TLI) {
-  if (PrintFRG) {
-    DEBUG(dbgs() << "TODO: Print factored redundancy graph.\n");
-    return false;
-  } else {
-    DEBUG(dbgs() << "Dummy PDSE pass.\n");
-  }
-  return false;
-}
+// Representations of factored redundancy graph elements.
+enum struct OccTy {
+  Real,
+  Lambda,
+};
+
+struct RealOcc;
+struct LambdaOcc;
+
+// Indexes PDSE.Worklist.
+using RedIdx = unsigned;
+
+struct Occurrence {
+  unsigned ID;
+  RedIdx Class;
+  // ^ Index of the redundancy class that this belongs to.
+  OccTy Type;
+
+  const RealOcc *isReal() const {
+    return Type == OccTy::Real ? reinterpret_cast<const RealOcc *>(this)
+                               : nullptr;
+  }
+
+  const LambdaOcc *isLambda() const {
+    return Type == OccTy::Lambda ? reinterpret_cast<const LambdaOcc *>(this)
+                                 : nullptr;
+  }
+
+  RealOcc *isReal() {
+    return Type == OccTy::Real ? reinterpret_cast<RealOcc *>(this) : nullptr;
+  }
+
+  LambdaOcc *isLambda() {
+    return Type == OccTy::Lambda ?
+        reinterpret_cast<LambdaOcc *>(this) : nullptr;
+  }
+
+  RedIdx setClass(RedIdx Class_) { return Class = Class_; }
+};
+
+struct RealOcc final : public Occurrence {
+  Instruction *Inst;
+  Occurrence *Def;
+  Optional<MemoryLocation> KillLoc;
+
+  RealOcc(unsigned ID, Instruction &I)
+      : Occurrence{ID, -1u, OccTy::Real}, Inst(&I), KillLoc(None) {}
+
+  RealOcc(unsigned ID, Instruction &I, MemoryLocation &&KillLoc)
+      : Occurrence{ID, -1u, OccTy::Real}, Inst(&I), KillLoc(KillLoc) {}
+
+  bool canDSE() const {
+    if (auto *SI = dyn_cast<StoreInst>(Inst)) {
+      return SI->isUnordered();
+    } else if (auto *MI = dyn_cast<MemIntrinsic>(Inst)) {
+      return !MI->isVolatile();
+    } else {
+      llvm_unreachable("Unknown real occurrence type.");
+    }
+  }
+};
+
+struct RedClass;
+
+struct LambdaOcc final : public Occurrence {
+  struct Operand {
+    Occurrence *Inner;
+
+    bool hasRealUse() const { return Inner->isReal(); }
+
+    LambdaOcc *getLambda() {
+      return Inner->isReal() ? Inner->isReal()->isLambda() : Inner->isLambda();
+    }
+  };
+
+  struct RealUse {
+    RealOcc *Occ;
+    BasicBlock *Pred;
+
+    Instruction &getInst() { return *Occ->Inst; }
+
+    const Instruction &getInst() const { return *Occ->Inst; }
+  };
+
+  BasicBlock *Block;
+  SmallVector<Operand, 4> Defs;
+  SmallVector<BasicBlock *, 4> NullDefs;
+  SmallVector<RealUse, 4> Uses;
+  // ^ All uses that alias or kill this lambda's occurrence class. A necessary
+  // condition for this lambda to be up-safe is that all its uses are the same
+  // class.
+  SmallVector<std::pair<LambdaOcc *, Operand *>, 4> LambdaUses;
+  // ^ Needed by the lambda refinement phases `CanBeAnt` and `Earlier`.
+
+  // Consult the Kennedy et al. paper for these.
+  bool UpSafe;
+  bool CanBeAnt;
+  bool Earlier;
+
+  LambdaOcc(BasicBlock &Block, RedIdx Class)
+      : Occurrence{-1u, Class, OccTy::Lambda}, Block(&Block), Defs(),
+        NullDefs(), Uses(), LambdaUses(), UpSafe(true), CanBeAnt(true),
+        Earlier(true) {}
+
+  void addUse(RealOcc &Occ, BasicBlock &Pred) { Uses.push_back({&Occ, &Pred}); }
+
+  void addUse(LambdaOcc &L, Operand &Op) { LambdaUses.push_back({&L, &Op}); }
+
+  LambdaOcc &addOperand(BasicBlock &Succ, Occurrence *ReprOcc) {
+    if (ReprOcc) {
+      Defs.push_back(Operand{ReprOcc});
+      if (LambdaOcc *L = Defs.back().getLambda())
+        L->addUse(*this, Defs.back());
+    } else
+      NullDefs.push_back(&Succ);
+    return *this;
+  }
+
+  void resetUpSafe() { UpSafe = false; }
+
+  void resetCanBeAnt() {
+    CanBeAnt = false;
+    Earlier = false;
+  }
+
+  void resetEarlier() { Earlier = false; }
+
+  bool willBeAnt() const { return CanBeAnt && !Earlier; }
+
+  static Value *getStoreOp(Instruction &I) {
+    if (auto *SI = dyn_cast<StoreInst>(&I)) {
+      return SI->getValueOperand();
+    } else if (auto *MI = dyn_cast<MemSetInst>(&I)) {
+      return MI->getValue();
+    } else if (auto *MI = dyn_cast<MemTransferInst>(&I)) {
+      return MI->getRawSource();
+    } else {
+      llvm_unreachable("Unknown real occurrence type.");
+    }
+  }
+
+  static Instruction &setStoreOp(Instruction &I, Value &V) {
+    if (auto *SI = dyn_cast<StoreInst>(&I)) {
+      SI->setOperand(0, &V);
+    } else if (auto *MI = dyn_cast<MemSetInst>(&I)) {
+      MI->setValue(&V);
+    } else if (auto *MI = dyn_cast<MemTransferInst>(&I)) {
+      MI->setSource(&V);
+    } else {
+      llvm_unreachable("Unknown real occurrence type.");
+    }
+    return I;
+  }
+
+  // See if this lambda's _|_ operands can be filled in. This requires that all
+  // uses of this lambda are the same instruction type and DSE-able (e.g., not
+  // volatile).
+  Instruction *createInsertionOcc() {
+    if (willBeAnt() && !NullDefs.empty() &&
+        all_of(Uses, [](const RealUse &Use) { return Use.Occ->canDSE(); })) {
+      if (Uses.size() == 1) {
+        // If there's only one use, PRE can happen even if volatile.
+ return Uses[0].getInst().clone(); + } else if (Uses.size() > 1) { + // The closest real occ users must have the same instruction type + auto Same = [&](const RealUse &Use) { + return Use.getInst().getOpcode() == Uses[0].getInst().getOpcode(); + }; + if (std::all_of(std::next(Uses.begin()), Uses.end(), Same)) { + assert(getStoreOp(Uses[0].getInst()) && "Expected store operand."); + PHINode *P = IRBuilder<>(Block, Block->begin()) + .CreatePHI(getStoreOp(Uses[0].getInst())->getType(), + Uses.size()); + for (RealUse &Use : Uses) + P->addIncoming(getStoreOp(Use.getInst()), Use.Pred); + return &setStoreOp(*Uses[0].getInst().clone(), *P); + } + } + } + return nullptr; + } + + raw_ostream &print(raw_ostream &, const SmallVectorImpl &) const; +}; + +// Factored redundancy graph representation for each maximal group of +// must-aliasing stores. +struct RedClass { + MemoryLocation Loc; + // ^ The memory location that each RealOcc mods and must-alias. + SmallVector Overwrites; + // ^ Indices of redundancy classes that can DSE this class. + SmallVector Interferes; + // ^ Indices of redundancy classes that alias this class. + bool Escapes; + // ^ Upon function unwind, can Loc escape? + bool Returned; + // ^ Is Loc returned by the function? + SmallVector Lambdas; + + RedClass(MemoryLocation Loc, bool Escapes, bool Returned) + : Loc(std::move(Loc)), Escapes(Escapes), Returned(Returned), Lambdas() {} + +private: + using LambdaStack = SmallVector; + + // All of the lambda occ refinement phases follow this depth-first structure + // to propagate some lambda flag from an initial set to the rest of the + // graph. + // Consult figures 8 and 10 of Kennedy et al. + void depthFirst(void (*push)(LambdaOcc &, LambdaStack &), + bool (*initial)(LambdaOcc &), + bool (*alreadyTraversed)(LambdaOcc &L)) { + LambdaStack Stack; + + for (LambdaOcc *L : Lambdas) + if (initial(*L)) + push(*L, Stack); + + while (!Stack.empty()) { + LambdaOcc &L = *Stack.pop_back_val(); + if (!alreadyTraversed(L)) + push(L, Stack); + } + } + + // If lambda P is repr occ to an operand of lambda Q and: + // - Q is up-unsafe (i.e., there is a reverse path from Q to function + // entry + // that doesn't cross any real occs of Q's class), and + // - there are no real occs from P to Q, + // then we can conclude that P is up-unsafe too. We use this to propagate + // up-unsafety to the rest of the FRG. + RedClass &propagateUpUnsafe() { + auto push = [](LambdaOcc &L, LambdaStack &Stack) { + for (LambdaOcc::Operand &Op : L.Defs) + if (LambdaOcc *L = Op.Inner->isLambda()) + Stack.push_back(L); + }; + auto initialCond = [](LambdaOcc &L) { return !L.UpSafe; }; + // If the top entry of the lambda stack is up-unsafe, then it and its + // operands already been traversed. 
+ auto &alreadyTraversed = initialCond; + + depthFirst(push, initialCond, alreadyTraversed); + return *this; + } + + RedClass &computeCanBeAnt() { + auto push = [](LambdaOcc &L, LambdaStack &Stack) { + L.resetCanBeAnt(); + for (auto &LO : L.LambdaUses) + if (!LO.second->hasRealUse() && !LO.first->UpSafe && LO.first->CanBeAnt) + Stack.push_back(LO.first); + }; + auto initialCond = [](LambdaOcc &L) { + return !L.UpSafe && L.CanBeAnt && !L.NullDefs.empty(); + }; + auto alreadyTraversed = [](LambdaOcc &L) { return !L.CanBeAnt; }; + + depthFirst(push, initialCond, alreadyTraversed); + return *this; + } + + RedClass &computeEarlier() { + auto push = [](LambdaOcc &L, LambdaStack &Stack) { + L.resetEarlier(); + for (auto &LO : L.LambdaUses) + if (LO.first->Earlier) + Stack.push_back(LO.first); + }; + auto initialCond = [](LambdaOcc &L) { + return L.Earlier && any_of(L.Defs, [](const LambdaOcc::Operand &Op) { + return Op.hasRealUse(); + }); + }; + auto alreadyTraversed = [](LambdaOcc &L) { return !L.Earlier; }; + + depthFirst(push, initialCond, alreadyTraversed); + return *this; + } + +public: + RedClass &willBeAnt() { + return propagateUpUnsafe().computeCanBeAnt().computeEarlier(); + } +}; + +raw_ostream &LambdaOcc::print(raw_ostream &O, + const SmallVectorImpl &Worklist) const { + const MemoryLocation &Loc = Worklist[Class].Loc; + O << "Lambda @ " << Block->getName() << " (" << *Loc.Ptr << " x " << Loc.Size + << ") [" << (UpSafe ? "U " : "!U ") << (CanBeAnt ? "C " : "!C ") + << (Earlier ? "E " : "!E ") << (willBeAnt() ? "W" : "!W") << "]"; + return O; } +class EscapeTracker { + const DataLayout &DL; + const TargetLibraryInfo &TLI; + DenseSet NonEscapes; + DenseSet Returns; + +public: + bool escapesOnUnwind(const Value *V) { + if (NonEscapes.count(V)) + return false; + if (isa(V) || + (isAllocLikeFn(V, &TLI) && !PointerMayBeCaptured(V, false, true))) { + NonEscapes.insert(V); + return false; + } + return true; + } + + bool escapesOnUnwind(const MemoryLocation &Loc) { + return escapesOnUnwind(GetUnderlyingObject(Loc.Ptr, DL)); + } + + bool returned(const Value *V) const { return Returns.count(V); } + + bool returned(const MemoryLocation &Loc) const { + return returned(GetUnderlyingObject(Loc.Ptr, DL)); + } + + EscapeTracker(Function &F, const TargetLibraryInfo &TLI) + : DL(F.getParent()->getDataLayout()), TLI(TLI) { + // Record non-escaping args. + for (Argument &Arg : F.args()) + if (Arg.hasByValOrInAllocaAttr()) + NonEscapes.insert(&Arg); + + // Record return values. + for (BasicBlock &BB : F) + if (auto *RI = dyn_cast(BB.getTerminator())) + if (Value *RetVal = RI->getReturnValue()) + Returns.insert(GetUnderlyingObject(RetVal, DL)); + } +}; + +class AliasCache { + const SmallVectorImpl &Worklist; + DenseMap, AliasResult> Aliases; + // ^ Caches aliases between memcpy-like kill locs with each class. + SmallVector, 8> ClassAliases; + // ^ Caches aliasing info between occurrence classes. + DenseMap, ModRefInfo> MRI; + AliasAnalysis &AA; + +public: + AliasCache(const SmallVectorImpl &Worklist, AliasAnalysis &AA) + : Worklist(Worklist), AA(AA) {} + + AliasResult alias(RedIdx A, const MemoryLocation &Loc) { + auto Key = std::make_pair(A, Loc); + return Aliases.count(Key) ? 
Aliases[Key] + : (Aliases[Key] = AA.alias(Worklist[A].Loc, Loc)); + } + + AliasResult alias(RedIdx A, RedIdx B) { + return ClassAliases[std::max(A, B)][std::min(A, B)]; + } + + decltype(ClassAliases)::value_type &push() { + ClassAliases.emplace_back(); + return ClassAliases.back(); + } + + void pop() { ClassAliases.pop_back(); } + + ModRefInfo getModRefInfo(RedIdx A, const Instruction &I) { + auto Key = std::make_pair(A, &I); + return MRI.count(Key) ? MRI[Key] + : (MRI[Key] = AA.getModRefInfo(&I, Worklist[A].Loc)); + } +}; + +using InstOrReal = PointerUnion; + +struct BlockInfo { + std::list Insts; + std::list Occs; + std::list Lambdas; +}; + +struct PDSE { + Function &F; + AliasAnalysis &AA; + PostDominatorTree &PDT; + const TargetLibraryInfo &TLI; + + unsigned NextID; + AliasCache AC; + EscapeTracker Tracker; + DenseMap Blocks; + SmallVector DeadStores; + SmallVector Worklist; + RealOcc DeadOnExit; + // ^ A faux occurrence used to detect stores to non-escaping memory that are + // redundant with respect to function exit. + + PDSE(Function &F, AliasAnalysis &AA, PostDominatorTree &PDT, + const TargetLibraryInfo &TLI) + : F(F), AA(AA), PDT(PDT), TLI(TLI), NextID(1), AC(Worklist, AA), + Tracker(F, TLI), DeadOnExit(0, *F.getEntryBlock().getTerminator()) {} + + // If Inst has the potential to be a DSE candidate, return its write + // location + // and a real occurrence wrapper. + Optional> makeRealOcc(Instruction &I) { + using std::make_pair; + if (auto *SI = dyn_cast(&I)) { + return make_pair(MemoryLocation::get(SI), RealOcc(NextID++, I)); + } else if (auto *MI = dyn_cast(&I)) { + return make_pair(MemoryLocation::getForDest(MI), RealOcc(NextID++, I)); + } else if (auto *MI = dyn_cast(&I)) { + // memmove, memcpy. + return make_pair(MemoryLocation::getForDest(MI), + RealOcc(NextID++, I, MemoryLocation::getForSource(MI))); + } + return None; + } + + RedIdx assignClass(const MemoryLocation &Loc, RealOcc &Occ, + DenseMap &BelongsToClass) { + if (BelongsToClass.count(Loc)) + return Occ.setClass(BelongsToClass[Loc]); + + auto &CachedAliases = AC.push(); + for (RedIdx Idx = 0; Idx < Worklist.size(); Idx += 1) { + RedClass &Class = Worklist[Idx]; + CachedAliases.emplace_back(AA.alias(Class.Loc, Loc)); + if (CachedAliases.back() == MustAlias && Class.Loc.Size == Loc.Size) { + AC.pop(); + return Occ.setClass(BelongsToClass[Loc] = Idx); + } + } + + // Occ doesn't belong to any existing class, so start a new class. + Worklist.emplace_back(Loc, Tracker.escapesOnUnwind(Loc), + Tracker.returned(Loc)); + RedIdx NewIdx = BelongsToClass[Worklist.back().Loc] = Worklist.size() - 1; + + // Copy must-aliases and may-alias into Overwrites and Interferes. + for (RedIdx Idx = 0; Idx < CachedAliases.size(); Idx += 1) { + if (CachedAliases[Idx] == MustAlias) { + // Found a class that could either overwrite or be overwritten by the + // new class. + if (Worklist[NewIdx].Loc.Size >= Worklist[Idx].Loc.Size) + Worklist[Idx].Overwrites.push_back(NewIdx); + else if (Worklist[NewIdx].Loc.Size <= Worklist[Idx].Loc.Size) + Worklist[NewIdx].Overwrites.push_back(Idx); + } else if (CachedAliases[Idx] != NoAlias) { + Worklist[Idx].Interferes.push_back(NewIdx); + Worklist[NewIdx].Interferes.push_back(Idx); + } + } + return Occ.setClass(NewIdx); + } + + struct RenameState { + struct Incoming { + Occurrence *ReprOcc; + BasicBlock *LambdaPred; + // ^ If ReprOcc is a lambda, then this is the predecessor (to the + // lambda-containing block) that post-doms us. 
+ }; + + SmallVector States; + + RenameState(SmallVectorImpl &Worklist, RealOcc &DeadOnExit) + : States(Worklist.size()) { + for (RedIdx Idx = 0; Idx < Worklist.size(); Idx += 1) + if (!Worklist[Idx].Escapes && !Worklist[Idx].Returned) + States[Idx] = {&DeadOnExit, nullptr}; + } + + void kill(RedIdx Idx) { States[Idx] = {nullptr, nullptr}; } + + bool live(RedIdx Idx) const { return States[Idx].ReprOcc; } + + LambdaOcc *exposedLambda(RedIdx Idx) const { + return live(Idx) ? States[Idx].ReprOcc->isLambda() : nullptr; + } + + RealOcc *exposedRepr(RedIdx Idx) const { + return live(Idx) ? States[Idx].ReprOcc->isReal() : nullptr; + } + + void updateUpSafety(RedIdx Idx) { + if (LambdaOcc *L = exposedLambda(Idx)) + L->resetUpSafe(); + } + }; + + bool canDSE(RealOcc &Occ, RenameState &S) { + // Can DSE if post-dommed by an overwrite. + return Occ.canDSE() && (S.exposedRepr(Occ.Class) || + any_of(Worklist[Occ.Class].Overwrites, + [&](RedIdx R) { return S.exposedRepr(R); })); + } + + void handleRealOcc(RealOcc &Occ, RenameState &S) { + DEBUG(dbgs() << "Hit a new occ: " << *Occ.Inst << " (" + << Occ.Inst->getParent()->getName() << ")\n"); + // Occ can't be DSE-ed, so set it as representative of its occ class. + if (!S.live(Occ.Class)) + S.States[Occ.Class] = RenameState::Incoming{&Occ, nullptr}; + else if (LambdaOcc *L = S.exposedLambda(Occ.Class)) { + L->addUse(Occ, *S.States[Occ.Class].LambdaPred); + S.States[Occ.Class] = {&Occ, nullptr}; + } + + // Find out how Occ interacts with incoming occ classes. + if (!Occ.KillLoc) + // Has no kill loc. Its store loc is only significant to incoming occ + // classes with exposed lambdas. + for (RedIdx Idx : Worklist[Occ.Class].Interferes) + S.updateUpSafety(Idx); + else + // Has a load that could kill some incoming class, in addition to the same + // store loc interaction above. + for (RedIdx Idx = 0; Idx < Worklist.size(); Idx += 1) + if (S.live(Idx) && (AC.alias(Idx, *Occ.KillLoc) != NoAlias || + AC.alias(Idx, Occ.Class))) + S.kill(Idx); + } + + void handleMayKill(Instruction &I, RenameState &S) { + for (RedIdx Idx = 0; Idx < S.States.size(); Idx += 1) + if (S.live(Idx) && Worklist[Idx].Escapes && I.mayThrow()) { + S.kill(Idx); + } else if (S.live(Idx)) { + ModRefInfo MRI = AC.getModRefInfo(Idx, I); + if (MRI & MRI_Ref) + // Aliasing load + S.kill(Idx); + else if (MRI & MRI_Mod) + // Aliasing store + S.updateUpSafety(Idx); + } + } + + void dse(Instruction &I) { + DEBUG(dbgs() << "DSE-ing " << I << " (" << I.getParent()->getName() + << ")\n"); + ++NumStores; + DeadStores.push_back(&I); + } + + RenameState renameBlock(BasicBlock &BB, RenameState S) { + DEBUG(dbgs() << "Entering block " << BB.getName() << "\n"); + // Record this block if it precedes a lambda block. + for (RenameState::Incoming &Inc : S.States) + if (Inc.ReprOcc && Inc.ReprOcc->isLambda() && !Inc.LambdaPred) + Inc.LambdaPred = &BB; + + // Set repr occs to lambdas, if present. + for (LambdaOcc &L : Blocks[&BB].Lambdas) + S.States[L.Class] = {&L, nullptr}; + + // Simultaneously rename and DSE in post-order. + for (InstOrReal &I : reverse(Blocks[&BB].Insts)) + if (auto *Occ = I.dyn_cast()) { + if (canDSE(*Occ, S)) + dse(*Occ->Inst); + else + handleRealOcc(*Occ, S); + } else + // Not a real occ, but still a meminst that could kill or alias. + handleMayKill(*I.get(), S); + + // Lambdas directly exposed to reverse-exit are up-unsafe. + if (&BB == &BB.getParent()->getEntryBlock()) + for (LambdaOcc &L : Blocks[&BB].Lambdas) + S.updateUpSafety(L.Class); + + // Connect to predecessor lambdas. 
+ for (BasicBlock *Pred : predecessors(&BB)) + for (LambdaOcc &L : Blocks[Pred].Lambdas) + L.addOperand(BB, S.States[L.Class].ReprOcc); + + return S; + } + + void renamePass() { + struct Entry { + DomTreeNode *Node; + DomTreeNode::iterator ChildIt; + RenameState Inner; + }; + + SmallVector Stack; + RenameState RootState(Worklist, DeadOnExit); + if (BasicBlock *Root = PDT.getRootNode()->getBlock()) + // Real and unique exit block. + Stack.push_back({PDT.getRootNode(), PDT.getRootNode()->begin(), + renameBlock(*Root, RootState)}); + else + // Multiple exits and/or infinite loops. + for (DomTreeNode *N : *PDT.getRootNode()) + Stack.push_back( + {N, N->begin(), renameBlock(*N->getBlock(), RootState)}); + + // Visit blocks in post-dom pre-order + while (!Stack.empty()) { + if (Stack.back().ChildIt == Stack.back().Node->end()) + Stack.pop_back(); + else { + DomTreeNode *Cur = *Stack.back().ChildIt++; + if (Cur->begin() != Cur->end()) + Stack.push_back({Cur, Cur->begin(), + renameBlock(*Cur->getBlock(), Stack.back().Inner)}); + else + renameBlock(*Cur->getBlock(), Stack.back().Inner); + } + } + } + + void convertPartialReds() { + // Maps a lambda block successor to either itself or its split edge block. + DenseMap SplitBlocks; + for (RedClass &Class : Worklist) { + // Determine PRE-ability of this class' lambdas. + Class.willBeAnt(); + for (LambdaOcc *L : Class.Lambdas) { + + DEBUG(L->print(dbgs() << "Trying to PRE ", Worklist) << "\n\tUses:\n"); + for (LambdaOcc::RealUse &Use : L->Uses) { + DEBUG(dbgs() << "\t\t" << Use.getInst() << " (" + << Use.getInst().getParent()->getName() << ")\n"); + } + DEBUG(dbgs() << "\tDefs:\n"); + for (LambdaOcc::Operand &Def : L->Defs) { + DEBUG(if (RealOcc *Occ = Def.Inner->isReal()) dbgs() + << "\t\t" << *Occ->Inst << " (" + << Occ->Inst->getParent()->getName() << ")\n"; + else Def.getLambda()->print(dbgs() << "\t", Worklist) << "\n"); + } + + if (L->NullDefs.empty()) { + // Already fully redundant, no PRE needed, trivially DSEs its uses. + DEBUG(L->print(dbgs(), Worklist) << " is already fully redun\n"); + for (LambdaOcc::RealUse &Use : L->Uses) + if (Use.Occ->canDSE()) + dse(Use.getInst()); + } else if (Instruction *I = L->createInsertionOcc()) { + // L is partially redundant and can be PRE-ed. + DEBUG(L->print(dbgs(), Worklist) << " can be PRE-ed with:\n\t" << *I + << "\n"); + for (BasicBlock *Succ : L->NullDefs) { + if (SplitBlocks.count(Succ)) + Succ = SplitBlocks[Succ]; + else if (BasicBlock *Split = SplitCriticalEdge(L->Block, Succ)) + Succ = SplitBlocks[Succ] = Split; + else + Succ = SplitBlocks[Succ] = Succ; + I->insertBefore(&*Succ->begin()); + DEBUG(dbgs() << "Inserting into " << Succ->getName() << "\n"); + } + for (LambdaOcc::RealUse &Use : L->Uses) { + ++NumPartialReds; + dse(Use.getInst()); + } + } + } + } + } + + bool run() { + DenseMap BelongsToClass; + SmallVector, 8> DefBlocks; + + // Collect real occs and track their basic blocks. + for (BasicBlock &BB : F) + for (Instruction &I : BB) + if (auto LocOcc = makeRealOcc(I)) { + // Found a real occ for this instruction. + RedIdx Idx = + assignClass(LocOcc->first, LocOcc->second, BelongsToClass); + if (Idx + 1 > DefBlocks.size()) + DefBlocks.emplace_back(); + DefBlocks[Idx].insert(&BB); + Blocks[&BB].Occs.emplace_back(std::move(LocOcc->second)); + Blocks[&BB].Insts.emplace_back(&Blocks[&BB].Occs.back()); + } else if (AA.getModRefInfo(&I)) + Blocks[&BB].Insts.emplace_back(&I); + + // Insert lambdas at reverse IDF of real occs and aliasing loads. 
+ for (RedIdx Idx = 0; Idx < Worklist.size(); Idx += 1) { + // Find kill-only blocks. + for (BasicBlock &BB : F) + for (InstOrReal &I : Blocks[&BB].Insts) { + auto *Occ = I.dyn_cast(); + auto *II = I.dyn_cast(); + if ((Occ && Occ->KillLoc && + AC.alias(Idx, *Occ->KillLoc) != NoAlias) || + (II && AC.getModRefInfo(Idx, *II) & MRI_Ref)) { + DefBlocks[Idx].insert(&BB); + break; + } + } + + // Compute lambdas. + ReverseIDFCalculator RIDF(PDT); + RIDF.setDefiningBlocks(DefBlocks[Idx]); + SmallVector LambdaBlocks; + RIDF.calculate(LambdaBlocks); + + for (BasicBlock *BB : LambdaBlocks) { + Blocks[BB].Lambdas.emplace_back(*BB, Idx); + Worklist[Idx].Lambdas.push_back(&Blocks[BB].Lambdas.back()); + DEBUG(Blocks[BB].Lambdas.back().print(dbgs() << "Inserted ", Worklist) + << "\n"); + } + } + + renamePass(); + convertPartialReds(); + + // DSE. + while (!DeadStores.empty()) { + Instruction *Dead = DeadStores.pop_back_val(); + for (Use &U : Dead->operands()) { + Instruction *Op = dyn_cast(U); + U.set(nullptr); + if (Op && isInstructionTriviallyDead(Op, &TLI)) + DeadStores.push_back(Op); + } + Dead->eraseFromParent(); + } + + return true; + } +}; + class PDSELegacyPass : public FunctionPass { public: PDSELegacyPass() : FunctionPass(ID) { @@ -82,9 +822,10 @@ if (skipFunction(F)) return false; - return runPDSE(F, getAnalysis().getAAResults(), - getAnalysis().getPostDomTree(), - getAnalysis().getTLI()); + return PDSE(F, getAnalysis().getAAResults(), + getAnalysis().getPostDomTree(), + getAnalysis().getTLI()) + .run(); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -93,7 +834,6 @@ AU.addRequired(); AU.setPreservesCFG(); - AU.addPreserved(); AU.addPreserved(); } @@ -114,15 +854,17 @@ namespace llvm { PreservedAnalyses PDSEPass::run(Function &F, FunctionAnalysisManager &AM) { - if (!runPDSE(F, AM.getResult(F), - AM.getResult(F), - AM.getResult(F))) + if (!PDSE(F, AM.getResult(F), + AM.getResult(F), + AM.getResult(F)) + .run()) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet(); - PA.preserve(); PA.preserve(); return PA; } + +FunctionPass *createPDSEPass() { return new PDSELegacyPass(); } } // end namespace llvm Index: test/Transforms/DeadStoreElimination/pdse.ll =================================================================== --- /dev/null +++ test/Transforms/DeadStoreElimination/pdse.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -pdse %s | FileCheck %s + +declare void @may_throw() +declare noalias i8* @malloc(i32) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64 , i32 , i1 ) +declare void @llvm.memmove.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1) + +define void @lo_and_chow(i8* %x, i1 %br0, i1 %br1) { +; CHECK-LABEL: @lo_and_chow( +; CHECK-NEXT: bb0: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 undef, label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 undef, i8* [[X:%.*]] +; CHECK-NEXT: [[T:%.*]] = load i8, i8* [[X]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 undef, i8* [[X]] +; CHECK-NEXT: ret void +; +bb0: + %v = load i8, i8* %x + %v1 = add nuw i8 %v, 1 + store i8 %v1, i8* %x + br label %bb1 +bb1: + br i1 %br0, label %bb2, label %bb3 +bb2: + %t = load i8, i8* %x + br label %bb3 +bb3: + store i8 %v1, i8* %x + br i1 %br1, label %bb1, label %exit +exit: + ret void +} + +; demos the self-loop 
problem in post-dom tree. +; define void @f(i8* %x) { +; a: +; store i8 0, i8* %x +; switch i8 0, label %b [ +; i8 1, label %c +; ] +; b: +; store i8 1, i8* %x +; br label %b +; c: +; store i8 2, i8* %x +; br label %d +; d: +; br label %d +; e: +; store i8 3, i8* %x +; ret void +; } +; +; define void @g(i8* %a, i8* %b) { +; bb0: +; store i8 undef, i8* %b +; store i8 undef, i8* %a +; br i1 undef, label %bb1, label %bb2 +; bb1: +; %tmp0 = load i8, i8* %a +; ret void +; bb2: +; store i8 undef, i8* %a +; ret void +; } + +; define void @i(i8* noalias %x, i8* noalias %y, i1 %z) { +; %whatever = load i8, i8* %x +; br label %nextblock +; +; nextblock: +; store i8 %whatever, i8* %x +; store i8 123, i8* %x +; br i1 %z, label %nextblock, label %fin +; +; fin: +; ret void +; } + +define i8* @j(i8* %a, i8* %e, i1 %c) { +; CHECK-LABEL: @j( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[B:%.*]] = alloca i8 +; CHECK-NEXT: [[P:%.*]] = tail call i8* @malloc(i32 4) +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* [[A:%.*]], i8* nonnull [[E:%.*]], i64 64, i32 8, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[A]], i8 undef, i64 32, i32 8, i1 false) +; CHECK-NEXT: [[X:%.*]] = bitcast i8* [[A]] to i64* +; CHECK-NEXT: [[Z:%.*]] = getelementptr i64, i64* [[X]], i64 1 +; CHECK-NEXT: store i64 undef, i64* [[Z]] +; CHECK-NEXT: store i8 undef, i8* [[A]] +; CHECK-NEXT: call void @may_throw() +; CHECK-NEXT: store i8 0, i8* [[P]] +; CHECK-NEXT: store i8 undef, i8* [[A]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret i8* [[P]] +; +bb0: + %b = alloca i8 + %P = tail call i8* @malloc(i32 4) + br i1 %c, label %bb1, label %bb2 +bb1: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b, i8* nonnull %a, i64 64, i32 8, i1 false) + call void @llvm.memmove.p0i8.p0i8.i64(i8* %a, i8* nonnull %e, i64 64, i32 8, i1 false) + call void @llvm.memset.p0i8.i64(i8* %a, i8 undef, i64 32, i32 8, i1 false) + %x = bitcast i8* %a to i64* + %z = getelementptr i64, i64* %x, i64 1 + store i8 undef, i8* %a + store i64 undef, i64* %z + store i8 undef, i8* %a + ; ^ future full elim phase should kill this + store i8 4, i8* %P + call void @may_throw() + store i8 0, i8* %P + store i8 undef, i8* %a + br label %bb3 +bb2: + br label %bb3 +bb3: + ret i8* %P +} + +define void @aliasing_load_kills(i8* %a) { +; CHECK-LABEL: @aliasing_load_kills( +; CHECK-NEXT: bb0: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 undef, label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 undef, i8* [[A:%.*]] +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb3: +; CHECK-NEXT: store i8 undef, i8* [[A]] +; CHECK-NEXT: [[X:%.*]] = load i8, i8* [[A]] +; CHECK-NEXT: store i8 undef, i8* [[A]] +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb4: +; CHECK-NEXT: ret void +; +bb0: + store i8 undef, i8* %a + br label %bb1 +bb1: + store i8 undef, i8* %a + br i1 undef, label %bb2, label %bb3 +bb2: + store i8 undef, i8* %a + br label %bb4 +bb3: + %x = load i8, i8* %a + store i8 undef, i8* %a + br label %bb4 +bb4: + ret void +} + +define void @memcpy_example(i8* %a, i8* %b, i1 %br0) { +; CHECK-LABEL: @memcpy_example( +; CHECK-NEXT: bb0: +; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[A:%.*]], i8* [[B:%.*]], i64 64, i32 8, i1 false) +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* [[A]], i8* [[B]], i64 64, i32 8, i1 false) +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; +bb0: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 64, i32 8, i1 false) + br i1 %br0, label %bb1, label %bb2 +bb1: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 64, i32 8, i1 false) + br label %bb3 +bb2: + br label %bb3 +bb3: + ret void +} + +; http://i.imgur.com/abuFdZ2.png +define void @multiple_pre(i8* %a, i8 %b, i1 %c, i1 %d) { +; CHECK-LABEL: @multiple_pre( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[R:%.*]] = add i8 [[B:%.*]], 1 +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[S:%.*]] = add i8 [[R]], 2 +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP0:%.*]] = phi i8 [ [[R]], [[BB2]] ], [ [[S]], [[BB1]] ] +; CHECK-NEXT: br i1 [[D:%.*]], label [[BB4:%.*]], label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: store i8 [[R]], i8* [[A:%.*]] +; CHECK-NEXT: br label [[EX:%.*]] +; CHECK: bb5: +; CHECK-NEXT: store i8 [[TMP0]], i8* [[A]] +; CHECK-NEXT: br label [[EX]] +; CHECK: ex: +; CHECK-NEXT: ret void +; +bb0: + %r = add i8 %b, 1 + br i1 %c, label %bb1, label %bb2 +bb1: + %s = add i8 %r, 2 + store i8 %s, i8* %a + br label %bb3 +bb2: + store i8 %r, i8* %a + br label %bb3 +bb3: + br i1 %d, label %bb4, label %bb5 +bb4: + store i8 %r, i8* %a + br label %ex +bb5: + br label %ex +ex: + ret void +} + +define void @unable_to_elim(i8* %a, i8 %b, i1 %c, i1 %d) { +; CHECK-LABEL: @unable_to_elim( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[R:%.*]] = add i8 [[B:%.*]], 1 +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[S:%.*]] = add i8 [[R]], 2 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[A:%.*]], i8 [[S]], i64 1, i32 1, i1 false) +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 [[R]], i8* [[A]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: br i1 [[D:%.*]], label [[BB4:%.*]], label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: store i8 [[R]], i8* [[A]] +; CHECK-NEXT: br label [[EX:%.*]] +; CHECK: bb5: +; CHECK-NEXT: br label [[EX]] +; CHECK: ex: +; CHECK-NEXT: ret void +; +bb0: + %r = add i8 %b, 1 + br i1 %c, label %bb1, label %bb2 +bb1: + %s = add i8 %r, 2 + call void @llvm.memset.p0i8.i64(i8* %a, i8 %s, i64 1, i32 1, i1 false) + br label %bb3 +bb2: + store i8 %r, i8* %a + br label %bb3 +bb3: + br i1 %d, label %bb4, label %bb5 +bb4: + store i8 %r, i8* %a + br label %ex +bb5: + br label %ex +ex: + ret void +} + +; PDSE-ing +; +; a, b, c --- lambda --- +; | +; +------- c, b, a +; +; into: +; +; --- lambda --- a, b, c +; | +; +------- c, b, a +; +; would require multiple rounds of willBeAnt +define void @pre_blocked(i8* %a, i8* %b, i8* %c, i1 %br0) { +bb0: + store i8 1, i8* %a + store i8 1, i8* %b + store i8 1, i8* %c + br i1 %br0, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + store i8 11, i8* %c + store i8 11, i8* %b + store i8 11, i8* %a + br label %bb3 +bb3: + ret void +} + +define void @never_escapes() { +bb0: + %a = alloca i8 + br label %bb1 +bb1: + store i8 12, i8* %a + br label %bb2 +bb2: + ret void +}
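;
; Sketch (assumed behavior, not an autogenerated test): @dead_on_exit_sketch is
; a hypothetical companion to @never_escapes. %local neither escapes nor is
; returned, so during renaming its redundancy class is seeded with the
; DeadOnExit occurrence and the store to it should be deletable as fully
; redundant with respect to function exit. The store through the escaping
; argument %p has no such guarantee and would have to stay.
;
; define void @dead_on_exit_sketch(i8* %p) {
; bb0:
;   %local = alloca i8
;   store i8 7, i8* %local
;   store i8 7, i8* %p
;   ret void
; }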