Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -93,6 +93,7 @@
       (void) llvm::createDomOnlyViewerPass();
       (void) llvm::createDomViewerPass();
       (void) llvm::createGCOVProfilerPass();
+      (void) llvm::createPDSEPass();
       (void) llvm::createPGOInstrumentationGenLegacyPass();
       (void) llvm::createPGOInstrumentationUseLegacyPass();
       (void) llvm::createPGOIndirectCallPromotionLegacyPass();
Index: include/llvm/Transforms/IPO/PassManagerBuilder.h
===================================================================
--- include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -152,6 +152,7 @@
   bool LoadCombine;
   bool NewGVN;
   bool DisableGVNLoadPRE;
+  bool PDSE;
   bool VerifyInput;
   bool VerifyOutput;
   bool MergeFunctions;
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -75,6 +75,12 @@
 //===----------------------------------------------------------------------===//
 //
+// PDSE - This pass deletes both partially and fully redundant stores.
+//
+FunctionPass *createPDSEPass();
+
+//===----------------------------------------------------------------------===//
+//
 // AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm. This
 // algorithm assumes instructions are dead until proven otherwise, which makes
 // it more successful are removing non-obviously dead instructions.
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -74,6 +74,9 @@
 static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden,
                                cl::desc("Run the NewGVN pass"));
 
+static cl::opt<bool> RunPDSE("enable-pdse", cl::init(false), cl::Hidden,
+                             cl::desc("Run with PDSE instead of DSE."));
+
 static cl::opt<bool> RunSLPAfterLoopVectorization(
     "run-slp-after-loop-vectorization", cl::init(true), cl::Hidden,
@@ -158,6 +161,7 @@
   RerollLoops = RunLoopRerolling;
   LoadCombine = RunLoadCombine;
   NewGVN = RunNewGVN;
+  PDSE = RunPDSE;
   DisableGVNLoadPRE = false;
   VerifyInput = false;
   VerifyOutput = false;
@@ -343,7 +347,8 @@
   addExtensionsToPM(EP_Peephole, MPM);
   MPM.add(createJumpThreadingPass());         // Thread jumps
   MPM.add(createCorrelatedValuePropagationPass());
-  MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+  MPM.add(PDSE ? createPDSEPass()
+               : createDeadStoreEliminationPass()); // Delete dead stores
   MPM.add(createLICMPass());
   addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
@@ -776,8 +781,8 @@
                 : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
   PM.add(createMemCpyOptPass());            // Remove dead memcpys.
-  // Nuke dead stores.
-  PM.add(createDeadStoreEliminationPass());
+  PM.add(PDSE ? createPDSEPass()            // Really nuke dead stores.
+              : createDeadStoreEliminationPass()); // Nuke dead stores.
   // More loops are countable; try to optimize them.
   PM.add(createIndVarSimplifyPass());
Index: lib/Transforms/Scalar/PDSE.cpp
===================================================================
--- lib/Transforms/Scalar/PDSE.cpp
+++ lib/Transforms/Scalar/PDSE.cpp
@@ -37,41 +37,912 @@
 // Partial Redundancy Elimination in SSA Form
 // https://doi.org/10.1145/319301.319348
 //
-// Differences between the papers and this implementation:
-// - May-throw instructions count as killing occurrences in the factored
-//   redundancy graph of escaping stores;
-// - TODO: Figure out partial overwrite tracking.
+// - TODO: Handle partial overwrite tracking during the full redundancy
+//   elimination phase.
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/PDSE.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+#include <forward_list>
+#include <list>
 
 #define DEBUG_TYPE "pdse"
 
 using namespace llvm;
 
+STATISTIC(NumStores, "Number of stores deleted");
+STATISTIC(NumPartialReds, "Number of partial redundancies converted.");
+
 static cl::opt<bool>
     PrintFRG("print-frg", cl::init(false), cl::Hidden,
              cl::desc("Print the factored redundancy graph of stores."));
 
 namespace {
-bool runPDSE(Function &F, AliasAnalysis &AA, const PostDominatorTree &PDT,
-             const TargetLibraryInfo &TLI) {
-  if (PrintFRG) {
-    DEBUG(dbgs() << "TODO: Print factored redundancy graph.\n");
-    return false;
+// Representations of factored redundancy graph elements.
+enum struct OccTy {
+  Real,
+  Lambda,
+};
+
+struct RealOcc;
+struct LambdaOcc;
+
+// Indexes PDSE.Worklist.
+using RedIdx = unsigned;
+
+// Indexes LambdaOcc::Flags
+using SubIdx = unsigned;
+
+struct Occurrence {
+  unsigned ID;
+  RedIdx Class;
+  // ^ Index of the redundancy class that this belongs to.
+  OccTy Type;
+
+  const RealOcc *isReal() const {
+    return Type == OccTy::Real ? reinterpret_cast<const RealOcc *>(this)
+                               : nullptr;
+  }
+
+  const LambdaOcc *isLambda() const {
+    return Type == OccTy::Lambda ? reinterpret_cast<const LambdaOcc *>(this)
+                                 : nullptr;
+  }
+
+  RealOcc *isReal() {
+    return Type == OccTy::Real ? reinterpret_cast<RealOcc *>(this) : nullptr;
+  }
+
+  LambdaOcc *isLambda() {
+    return Type == OccTy::Lambda ?
reinterpret_cast(this) + : nullptr; + } +}; + +struct RedClass; + +struct RealOcc final : public Occurrence { + SubIdx Subclass; + Instruction *Inst; + Occurrence *Def; + MemoryLocation KillLoc; + + RealOcc(unsigned ID, Instruction &I) + : Occurrence{ID, -1u, OccTy::Real}, Subclass(-1u), Inst(&I), KillLoc() {} + + RealOcc(unsigned ID, Instruction &I, MemoryLocation &&KillLoc) + : Occurrence{ID, -1u, OccTy::Real}, Subclass(-1u), Inst(&I), + KillLoc(KillLoc) {} + + bool canDSE() const { + if (auto *SI = dyn_cast(Inst)) { + return SI->isUnordered(); + } else if (auto *MI = dyn_cast(Inst)) { + return !MI->isVolatile(); + } else { + llvm_unreachable("Unknown real occurrence type."); + } + } + + RedIdx setClass(RedIdx Class_, SubIdx Subclass_) { + Subclass = Subclass_; + return Class = Class_; + } + + raw_ostream &print(raw_ostream &, ArrayRef) const; +}; + +Value *getStoreOp(Instruction &); +Instruction &setStoreOp(Instruction &, Value &); + +bool hasMultiplePreds(const BasicBlock &BB, bool AlreadyOnePred = true) { + auto B = pred_begin(&BB); + auto E = pred_end(&BB); + return (AlreadyOnePred || B != E) && ++B != E; +} + +struct LambdaOcc final : public Occurrence { + struct Operand { + BasicBlock *Succ; + Occurrence *Inner; + + RealOcc *hasRealUse() const { return Inner->isReal(); } + + LambdaOcc *getLambda() const { + return Inner->isReal() ? Inner->isReal()->isLambda() : Inner->isLambda(); + } + + Operand(BasicBlock &Succ, Occurrence &Inner) : Succ(&Succ), Inner(&Inner) {} + }; + + struct RealUse { + RealOcc *Occ; + + Instruction &getInst() const { return *Occ->Inst; } + }; + + struct LambdaUse { + LambdaOcc *L; + size_t OpIdx; + + Operand &getOp() const { return L->Defs[OpIdx]; } + + LambdaUse(LambdaOcc &L, size_t OpIdx) : L(&L), OpIdx(OpIdx) {} + }; + + BasicBlock *Block; + SmallVector Defs; + SmallVector NullDefs; + SmallVector Uses; + // ^ Real occurrences for which this lambda is representative (a def). Each + // use will be in the same redundancy class as this lambda (meaning that the + // stores they represent are same-sized and must-alias), but can have + // different sub-class indexes (memset, memcpy, plain store, etc.). + SmallVector LambdaUses; + // ^ These lambdas either directly use this lambda, or use a real use of this + // lambda. Needed to propagate `CanBeAnt` and `Earlier`. + + struct SubFlags { + // Consult the Kennedy et al. paper for these. + bool UpSafe; + bool CanBeAnt; + bool Earlier; + + SubFlags() : UpSafe(true), CanBeAnt(true), Earlier(true) {} + }; + + std::vector Flags; + // ^ Anticipation computation, indexed by subclass. + + bool upSafe(SubIdx Sub) const { return Flags[Sub].UpSafe; } + + bool canBeAnt(SubIdx Sub) const { return Flags[Sub].CanBeAnt; } + + bool earlier(SubIdx Sub) const { return Flags[Sub].Earlier; } + + LambdaOcc(unsigned ID, BasicBlock &Block, RedIdx Class, + unsigned NumSubclasses) + : Occurrence{ID, Class, OccTy::Lambda}, Block(&Block), Defs(), NullDefs(), + Uses(), LambdaUses(), Flags(NumSubclasses) {} + + void addUse(RealOcc &Occ) { Uses.push_back({&Occ}); } + + void addUse(LambdaOcc &L, size_t OpIdx) { LambdaUses.emplace_back(L, OpIdx); } + + LambdaOcc &addOperand(BasicBlock &Succ, Occurrence *ReprOcc) { + if (ReprOcc) { + Defs.emplace_back(Succ, *ReprOcc); + if (LambdaOcc *L = Defs.back().getLambda()) + L->addUse(*this, Defs.size() - 1); + } else + NullDefs.push_back(&Succ); + return *this; + } + + // Does this lambda have null defs that can't be filled by PRE? 
Yes, if: + bool hasNullNonInsertable() const { + // - Any of the defs reside on a critical edge that can't be split. + return isa(Block->getTerminator()) && + any_of(NullDefs, [](const BasicBlock *Succ) { + return hasMultiplePreds(*Succ); + }); + // TODO: EH pads shouldn't be split either. + } + + void resetUpSafe() { + for (SubFlags &F : Flags) + F.UpSafe = false; + } + + void resetUpSafe(SubIdx Sub) { Flags[Sub].UpSafe = false; } + + void resetUpSafeExcept(SubIdx Sub_) { + for (SubIdx Sub = 0; Sub < Flags.size(); Sub += 1) + if (Sub_ != Sub) + Flags[Sub].UpSafe = false; + } + + void resetCanBeAnt(SubIdx Sub) { + Flags[Sub].CanBeAnt = Flags[Sub].Earlier = false; + } + + void resetEarlier(SubIdx Sub) { Flags[Sub].Earlier = false; } + + bool willBeAnt(SubIdx Sub) const { + return Flags[Sub].CanBeAnt && !Flags[Sub].Earlier; + } + + raw_ostream &print(raw_ostream &, ArrayRef, bool = false, + SubIdx * = nullptr) const; +}; + +// Factored redundancy graph representation for each maximal group of +// must-aliasing stores. +struct RedClass { + MemoryLocation Loc; + // ^ The memory location that each RealOcc mods and must-alias. + SmallVector Overwrites; + // ^ Indices of redundancy classes that this class can DSE. + SmallVector Interferes; + // ^ Indices of redundancy classes that may-alias this class. + bool Escapes; + // ^ Upon function unwind, can Loc escape? + bool Returned; + // ^ Is Loc returned by the function? + SmallVector Lambdas; + std::vector StoreTypes; + + DenseMap, SubIdx> Subclasses; + SmallPtrSet DefBlocks; + + RedClass(MemoryLocation Loc, bool Escapes, bool Returned) + : Loc(std::move(Loc)), Escapes(Escapes), Returned(Returned), Lambdas(), + StoreTypes(), Subclasses(), DefBlocks() {} + +private: + using LambdaStack = SmallVector; + + // All of the lambda occ refinement phases follow this depth-first structure + // to propagate some lambda flag from an initial set to the rest of the graph. + // Consult figures 8 and 10 of Kennedy et al. + void depthFirst(std::function push, + std::function initial, + std::function alreadyTraversed) { + LambdaStack Stack; + + for (LambdaOcc *L : Lambdas) + if (initial(*L)) + push(*L, Stack); + + while (!Stack.empty()) { + LambdaOcc &L = *Stack.pop_back_val(); + if (!alreadyTraversed(L)) + push(L, Stack); + } + } + + // If lambda P is repr occ to an operand of lambda Q and: + // - Q is up-unsafe (i.e., there is a reverse path from Q to function entry + // that doesn't cross any real occs of Q's class), and + // - there are no real occs from P to Q, + // then we can conclude that P is up-unsafe too. We use this to propagate + // up-unsafety to the rest of the FRG. + RedClass &propagateUpUnsafe(SubIdx Sub) { + auto push = [&](LambdaOcc &L, LambdaStack &Stack) { + L.resetUpSafe(Sub); + for (LambdaOcc::Operand &Op : L.Defs) + if (!Op.hasRealUse()) + if (LambdaOcc *L = Op.getLambda()) + Stack.push_back(L); + }; + auto initialCond = [&](LambdaOcc &L) { return !L.upSafe(Sub); }; + // If the top entry of the lambda stack is up-unsafe, then it and its + // operands already been traversed. 
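+    // In effect, starting from every lambda that is already up-unsafe, the
+    // depthFirst call below clears UpSafe on every lambda it can reach
+    // through def operands that carry no real occurrence, so up-unsafety
+    // spreads transitively along store-free lambda chains (see the
+    // @propagate_up_unsafety test in pdse.ll for a concrete case).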
+ auto &alreadyTraversed = initialCond; + + depthFirst(push, initialCond, alreadyTraversed); + return *this; + } + + RedClass &computeCanBeAnt(SubIdx Sub) { + auto push = [&](LambdaOcc &L, LambdaStack &Stack) { + L.resetCanBeAnt(Sub); + for (LambdaOcc::LambdaUse &Use : L.LambdaUses) + if (Use.L->canBeAnt(Sub) && !Use.getOp().hasRealUse() && + (!Use.L->upSafe(Sub) || + (isa(Use.L->Block->getTerminator()) && + hasMultiplePreds(*Use.getOp().Succ)))) + Stack.push_back(Use.L); + }; + auto initialCond = [&](LambdaOcc &L) { + return (!L.upSafe(Sub) && L.canBeAnt(Sub) && !L.NullDefs.empty()) || + L.hasNullNonInsertable(); + }; + auto alreadyTraversed = [&](LambdaOcc &L) { return !L.canBeAnt(Sub); }; + + depthFirst(push, initialCond, alreadyTraversed); + return *this; + } + + RedClass &computeEarlier(SubIdx Sub) { + auto push = [&](LambdaOcc &L, LambdaStack &Stack) { + L.resetEarlier(Sub); + for (LambdaOcc::LambdaUse &Use : L.LambdaUses) + if (Use.L->earlier(Sub)) + Stack.push_back(Use.L); + }; + auto initialCond = [&](LambdaOcc &L) { + return L.earlier(Sub) && any_of(L.Defs, [](const LambdaOcc::Operand &Op) { + return Op.hasRealUse(); + }); + }; + auto alreadyTraversed = [&](LambdaOcc &L) { return !L.earlier(Sub); }; + + depthFirst(push, initialCond, alreadyTraversed); + return *this; + } + +public: + RedClass &computeWillBeAnt() { + if (Lambdas.size() > 0) { + DEBUG(dbgs() << "Computing willBeAnt\n"); + for (SubIdx Sub = 0; Sub < numSubclasses(); Sub += 1) + propagateUpUnsafe(Sub).computeCanBeAnt(Sub).computeEarlier(Sub); + } + return *this; + } + + SubIdx numSubclasses() const { return StoreTypes.size(); } + + friend raw_ostream &operator<<(raw_ostream &O, const RedClass &Class) { + return O << *Class.Loc.Ptr << " x " << Class.Loc.Size; + } +}; + +raw_ostream &RealOcc::print(raw_ostream &O, ArrayRef Worklist) const { + return ID ? (O << "Real @ " << Inst->getParent()->getName() << " (" + << Worklist[Class] << ") " << *Inst) + : (O << "DeadOnExit"); +} + +raw_ostream &LambdaOcc::print(raw_ostream &O, ArrayRef Worklist, + bool UsesDefs, SubIdx *Sub) const { + O << "Lambda @ " << Block->getName() << " (" << Worklist[Class] << ")"; + if (Sub) + dbgs() << " [" << (upSafe(*Sub) ? "U " : "!U ") + << (canBeAnt(*Sub) ? "C " : "!C ") << (earlier(*Sub) ? "E " : "!E ") + << (willBeAnt(*Sub) ? 
"W" : "!W") << "]"; + if (UsesDefs) { + O << "\n"; + for (const LambdaOcc::RealUse &Use : Uses) + Use.Occ->print(dbgs() << "\tUse: ", Worklist) << "\n"; + for (const LambdaOcc::Operand &Def : Defs) + if (RealOcc *Occ = Def.hasRealUse()) + Occ->print(dbgs() << "\tDef: ", Worklist) << "\n"; + else + Def.getLambda()->print(dbgs() << "\tDef: ", Worklist) << "\n"; + for (const BasicBlock *BB : NullDefs) + dbgs() << "\tDef: _|_ @ " << BB->getName() << "\n"; + } + return O; +} + +class EscapeTracker { + const DataLayout &DL; + const TargetLibraryInfo &TLI; + DenseSet NonEscapes; + DenseSet Returns; + +public: + bool escapesOnUnwind(const Value *V) { + if (NonEscapes.count(V)) + return false; + if (isa(V) || + (isAllocLikeFn(V, &TLI) && !PointerMayBeCaptured(V, false, true))) { + NonEscapes.insert(V); + return false; + } + return true; + } + + bool escapesOnUnwind(const MemoryLocation &Loc) { + return escapesOnUnwind(GetUnderlyingObject(Loc.Ptr, DL)); + } + + bool returned(const Value *V) const { return Returns.count(V); } + + bool returned(const MemoryLocation &Loc) const { + return returned(GetUnderlyingObject(Loc.Ptr, DL)); + } + + EscapeTracker(Function &F, const TargetLibraryInfo &TLI) + : DL(F.getParent()->getDataLayout()), TLI(TLI) { + // Record non-escaping args. + for (Argument &Arg : F.args()) + if (Arg.hasByValOrInAllocaAttr()) + NonEscapes.insert(&Arg); + + // Record return values. + for (BasicBlock &BB : F) + if (auto *RI = dyn_cast(BB.getTerminator())) + if (Value *RetVal = RI->getReturnValue()) + Returns.insert(GetUnderlyingObject(RetVal, DL)); + } +}; + +Value *getStoreOp(Instruction &I) { + if (auto *SI = dyn_cast(&I)) { + return SI->getValueOperand(); + } else if (auto *MI = dyn_cast(&I)) { + return MI->getValue(); + } else if (auto *MI = dyn_cast(&I)) { + return MI->getRawSource(); + } else { + llvm_unreachable("Unknown real occurrence type."); + } +} + +Instruction &setStoreOp(Instruction &I, Value &V) { + if (auto *SI = dyn_cast(&I)) { + SI->setOperand(0, &V); + } else if (auto *MI = dyn_cast(&I)) { + MI->setValue(&V); + } else if (auto *MI = dyn_cast(&I)) { + MI->setSource(&V); } else { - DEBUG(dbgs() << "Dummy PDSE pass.\n"); + llvm_unreachable("Unknown real occurrence type."); } - return false; + return I; } +// If Inst has the potential to be a DSE candidate, return its write location +// and a real occurrence wrapper. +Optional> makeRealOcc(Instruction &I, + unsigned &NextID) { + using std::make_pair; + if (auto *SI = dyn_cast(&I)) { + return make_pair(MemoryLocation::get(SI), RealOcc(NextID++, I)); + } else if (auto *MI = dyn_cast(&I)) { + return make_pair(MemoryLocation::getForDest(MI), RealOcc(NextID++, I)); + } else if (auto *MI = dyn_cast(&I)) { + // memmove, memcpy. + return make_pair(MemoryLocation::getForDest(MI), + RealOcc(NextID++, I, MemoryLocation::getForSource(MI))); + } + return None; +} + +class InstOrReal { + union { + Instruction *I; + RealOcc Occ; + }; + bool IsOcc; + +public: + InstOrReal(RealOcc &&Occ) : Occ(std::move(Occ)), IsOcc(true) {} + + InstOrReal(Instruction &I) : I(&I), IsOcc(false) {} + + const RealOcc *getOcc() const { return IsOcc ? &Occ : nullptr; } + + RealOcc *getOcc() { return IsOcc ? &Occ : nullptr; } + + Instruction *getInst() const { return IsOcc ? 
nullptr : I; } +}; + +struct BlockInfo { + std::list Insts; + std::list Lambdas; +}; + +struct PDSE { + Function &F; + AliasAnalysis &AA; + PostDominatorTree &PDT; + const TargetLibraryInfo &TLI; + + unsigned NextID; + DenseMap, ModRefInfo> MRI; + // ^ Caches calls to AliasAnalysis::getModRefInfo. + DenseMap Blocks; + std::forward_list DeadStores; + std::vector Worklist; + RealOcc DeadOnExit; + // ^ A faux occurrence used to detect stores to non-escaping memory that are + // redundant with respect to function exit. + + PDSE(Function &F, AliasAnalysis &AA, PostDominatorTree &PDT, + const TargetLibraryInfo &TLI) + : F(F), AA(AA), PDT(PDT), TLI(TLI), NextID(1), + DeadOnExit(0, *F.getEntryBlock().getTerminator()) {} + + ModRefInfo getModRefInfo(RedIdx A, const Instruction &I) { + auto Key = std::make_pair(A, &I); + return MRI.count(Key) ? MRI[Key] + : (MRI[Key] = AA.getModRefInfo(&I, Worklist[A].Loc)); + } + + RedIdx classifyLoc(const MemoryLocation &Loc, + DenseMap &BelongsToClass, + EscapeTracker &Tracker) { + DEBUG(dbgs() << "Examining store location " << *Loc.Ptr << " x " << Loc.Size + << "\n"); + if (BelongsToClass.count(Loc)) + return BelongsToClass[Loc]; + + std::vector CachedAliases(Worklist.size()); + for (RedIdx Idx = 0; Idx < Worklist.size(); Idx += 1) { + RedClass &Class = Worklist[Idx]; + if ((CachedAliases[Idx] = AA.alias(Class.Loc, Loc)) == MustAlias && + Class.Loc.Size == Loc.Size) + return BelongsToClass[Loc] = Idx; + } + + // Loc doesn't belong to any existing RedClass, so start a new one. + Worklist.emplace_back(Loc, Tracker.escapesOnUnwind(Loc), + Tracker.returned(Loc)); + RedIdx NewIdx = BelongsToClass[Worklist.back().Loc] = Worklist.size() - 1; + + // Copy must-/may-aliases into Overwrites and Interferes. + for (RedIdx Idx = 0; Idx < CachedAliases.size(); Idx += 1) { + if (CachedAliases[Idx] == MustAlias) { + assert(Worklist[NewIdx].Loc.Size != Worklist[Idx].Loc.Size && + "Loc should have been part of redundancy class Idx."); + if (Worklist[NewIdx].Loc.Size >= Worklist[Idx].Loc.Size) + Worklist[NewIdx].Overwrites.push_back(Idx); + else if (Worklist[NewIdx].Loc.Size <= Worklist[Idx].Loc.Size) + Worklist[Idx].Overwrites.push_back(NewIdx); + } else if (CachedAliases[Idx] != NoAlias) { + Worklist[Idx].Interferes.push_back(NewIdx); + Worklist[NewIdx].Interferes.push_back(Idx); + } + } + return NewIdx; + } + + struct RenameState { + struct Incoming { + Occurrence *ReprOcc; + }; + + std::vector States; + + RenameState(ArrayRef Worklist, RealOcc &DeadOnExit) + : States(Worklist.size()) { + for (RedIdx Idx = 0; Idx < Worklist.size(); Idx += 1) + if (!Worklist[Idx].Escapes && !Worklist[Idx].Returned) { + DEBUG(dbgs() << "Class " << Worklist[Idx] << " is dead on exit.\n"); + States[Idx] = {&DeadOnExit}; + } + } + + bool live(RedIdx Idx) const { return States[Idx].ReprOcc; } + + LambdaOcc *exposedLambda(RedIdx Idx) const { + return live(Idx) ? States[Idx].ReprOcc->isLambda() : nullptr; + } + + RealOcc *exposedRepr(RedIdx Idx) const { + return live(Idx) ? 
States[Idx].ReprOcc->isReal() : nullptr; + } + }; + + void updateUpSafety(RedIdx Idx, RenameState &S) { + if (LambdaOcc *L = S.exposedLambda(Idx)) { + DEBUG(L->print(dbgs() << "Setting up-unsafe: ", Worklist) << "\n"); + L->resetUpSafe(); + } + } + + void kill(RedIdx Idx, RenameState &S) { + DEBUG(dbgs() << "Killing class " << Worklist[Idx] << "\n"); + updateUpSafety(Idx, S); + S.States[Idx] = {nullptr}; + } + + void handleRealOcc(RealOcc &Occ, RenameState &S) { + DEBUG(Occ.print(dbgs() << "Hit a new occ: ", Worklist) << "\n"); + if (!S.live(Occ.Class)) { + DEBUG(dbgs() << "Setting to new repr of " << Worklist[Occ.Class] << "\n"); + S.States[Occ.Class] = RenameState::Incoming{&Occ}; + } else if (LambdaOcc *L = S.exposedLambda(Occ.Class)) { + L->resetUpSafeExcept(Occ.Subclass); + L->addUse(Occ); + S.States[Occ.Class] = RenameState::Incoming{&Occ}; + } + + // Examine interactions with its store loc. + for (RedIdx Idx : Worklist[Occ.Class].Interferes) + updateUpSafety(Idx, S); + for (RedIdx Idx : Worklist[Occ.Class].Overwrites) + if (!S.live(Idx)) { + // Any of Idx's occs post-dommed by Occ can be DSE-ed (barring some + // intervening load that aliases Idx). Since Idx is _|_, this occ is + // Idx's new repr. + DEBUG(dbgs() << "Setting to repr of smaller: " << Worklist[Idx] + << "\n"); + S.States[Idx] = RenameState::Incoming{&Occ}; + } else + // Otherwise, if Idx is a lambda, this occ stomps its up-safety. + updateUpSafety(Idx, S); + + // Find out how Occ's KillLoc, if any, interacts with incoming occ classes. + if (Occ.KillLoc.Ptr) + for (RedIdx Idx = 0; Idx < Worklist.size(); Idx += 1) + if (S.live(Idx) && + AA.alias(Worklist[Idx].Loc, Occ.KillLoc) != NoAlias) { + DEBUG(dbgs() << "KillLoc aliases: " + << AA.alias(Worklist[Idx].Loc, Occ.KillLoc) << " "); + kill(Idx, S); + } + } + + void handleMayKill(Instruction &I, RenameState &S) { + for (RedIdx Idx = 0; Idx < S.States.size(); Idx += 1) + if (S.live(Idx) && Worklist[Idx].Escapes && I.mayThrow()) { + kill(Idx, S); + } else if (S.live(Idx)) { + // TODO: Handle calls to `free` as DeadOnExit. + ModRefInfo MRI = getModRefInfo(Idx, I); + if (MRI & MRI_Ref) + // Aliasing load + kill(Idx, S); + else if (MRI & MRI_Mod) + // Aliasing store + updateUpSafety(Idx, S); + } + } + + void dse(Instruction &I) { + DEBUG(dbgs() << "DSE-ing " << I << " (" << I.getParent()->getName() + << ")\n"); + ++NumStores; + DeadStores.push_front(&I); + } + + RenameState renameBlock(BasicBlock &BB, RenameState S) { + DEBUG(dbgs() << "Entering block " << BB.getName() << "\n"); + // Set repr occs to lambdas, if present. + for (LambdaOcc &L : Blocks[&BB].Lambdas) + S.States[L.Class] = {&L}; + + // Simultaneously rename and DSE in post-order. + for (InstOrReal &I : reverse(Blocks[&BB].Insts)) + if (RealOcc *Occ = I.getOcc()) { + if (Occ->canDSE() && S.exposedRepr(Occ->Class)) + dse(*Occ->Inst); + else + handleRealOcc(*Occ, S); + } else + // Not a real occ, but still a meminst that could kill or alias. + handleMayKill(*I.getInst(), S); + + // Lambdas directly exposed to reverse CFG exit are up-unsafe. + if (&BB == &BB.getParent()->getEntryBlock()) + for (RedIdx Idx = 0; Idx < S.States.size(); Idx += 1) + updateUpSafety(Idx, S); + + // Connect to predecessor lambdas. 
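+    // (This mirrors how SSA renaming fills in phi operands, but on the
+    // reverse CFG: each lambda sitting in a predecessor block receives one
+    // operand per successor edge, recording whichever occurrence of its
+    // class is exposed at the top of this block, or a null ("_|_") operand
+    // if nothing of that class is live here.)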
+ for (BasicBlock *Pred : predecessors(&BB)) + for (LambdaOcc &L : Blocks[Pred].Lambdas) + L.addOperand(BB, S.States[L.Class].ReprOcc); + + return S; + } + + void renamePass() { + struct Entry { + DomTreeNode *Node; + DomTreeNode::iterator ChildIt; + RenameState Inner; + }; + + SmallVector Stack; + RenameState RootState(Worklist, DeadOnExit); + if (BasicBlock *Root = PDT.getRootNode()->getBlock()) + // Real and unique exit block. + Stack.push_back({PDT.getRootNode(), PDT.getRootNode()->begin(), + renameBlock(*Root, RootState)}); + else + // Multiple exits and/or infinite loops. + for (DomTreeNode *N : *PDT.getRootNode()) + Stack.push_back( + {N, N->begin(), renameBlock(*N->getBlock(), RootState)}); + + // Visit blocks in post-dom pre-order + while (!Stack.empty()) { + if (Stack.back().ChildIt == Stack.back().Node->end()) + Stack.pop_back(); + else { + DomTreeNode *Cur = *Stack.back().ChildIt++; + if (Cur->begin() != Cur->end()) + Stack.push_back({Cur, Cur->begin(), + renameBlock(*Cur->getBlock(), Stack.back().Inner)}); + else + renameBlock(*Cur->getBlock(), Stack.back().Inner); + } + } + } + + using SplitEdgeMap = + DenseMap, BasicBlock *>; + + void insertNewOccs(LambdaOcc &L, SubIdx Sub, Instruction &Ins, + SSAUpdater &StoreVals, SplitEdgeMap &SplitBlocks) { + auto insert = [&](BasicBlock *Succ) { + Instruction &I = + setStoreOp(*Ins.clone(), *StoreVals.GetValueAtEndOfBlock(L.Block)); + DEBUG(dbgs() << "PRE insert: " << I << " @ " << Succ->getName() + << " (from " << L.Block->getName() << ")\n"); + + if (SplitBlocks.count({L.Block, Succ})) + Succ = SplitBlocks[{L.Block, Succ}]; + else if (BasicBlock *Split = SplitCriticalEdge(L.Block, Succ)) + Succ = SplitBlocks[{L.Block, Succ}] = Split; + else + SplitBlocks[{L.Block, Succ}] = Succ; + + // Need to insert after phis. + // - TODO: Cache insertion pos.. + // - TODO: getFirstNonPHI; cache it. and we shouldn't be breaking eh pads, + // or inserting into catchswitches. + BasicBlock::iterator InsPos = find_if(*Succ, [](const Instruction &I) { + return !isa(&I) && !isa(&I); + }); + I.insertBefore(&*InsPos); + }; + + assert(L.willBeAnt(Sub) && "Can only PRE across willBeAnt lambdas."); + for (BasicBlock *Succ : L.NullDefs) + insert(Succ); + for (LambdaOcc::Operand &Op : L.Defs) + if (!Op.hasRealUse()) + if (Op.getLambda() && !Op.getLambda()->willBeAnt(Sub)) + insert(Op.Succ); + } + + void convertPartialReds() { + SplitEdgeMap SplitBlocks; + for (RedClass &Class : Worklist) { + // Determine PRE-ability of this class' lambdas. + Class.computeWillBeAnt(); + + // TODO: Iterate by lambda, not subclass. + SSAUpdater StoreVals; + for (SubIdx Sub = 0; Sub < Class.numSubclasses(); Sub += 1) { + assert(getStoreOp(*Class.StoreTypes[Sub]) && + "Expected an analyzable store instruction."); + StoreVals.Initialize(getStoreOp(*Class.StoreTypes[Sub])->getType(), + Class.StoreTypes[Sub]->getName()); + + // Collect all possible store operand definitions that will flow into + // the inserted stores. 
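+        // (Each willBeAnt lambda contributes the stored values of its real
+        // uses in this subclass to the SSAUpdater; GetValueAtEndOfBlock in
+        // insertNewOccs later rebuilds the value at the lambda block,
+        // creating phis where the contributing stores arrive from different
+        // paths. The contributing stores themselves become fully redundant
+        // and are queued for deletion here when canDSE() allows it.)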
+ for (LambdaOcc *L : Class.Lambdas) { + DEBUG(L->print(dbgs() << "Analyzing ", Worklist, false, &Sub) + << "\n"); + if (L->willBeAnt(Sub)) + for (LambdaOcc::RealUse &Use : L->Uses) { + if (Use.Occ->Subclass == Sub) { + DEBUG(dbgs() << "Including " << *getStoreOp(Use.getInst()) + << "\n"); + StoreVals.AddAvailableValue(Use.getInst().getParent(), + getStoreOp(Use.getInst())); + if (Use.Occ->canDSE()) + dse(Use.getInst()); + } + } + } + for (LambdaOcc *L : Class.Lambdas) { + if (L->willBeAnt(Sub)) { + DEBUG(L->print(dbgs() << "Trying to PRE #" << Sub, Worklist, true, + &Sub)); + insertNewOccs(*L, Sub, *Class.StoreTypes[Sub], StoreVals, + SplitBlocks); + } + } + } + } + } + + // Insert lambdas at reverse IDF of real occs and aliasing loads. + void insertLambdas() { + for (RedIdx Idx = 0; Idx < Worklist.size(); Idx += 1) { + // Find kill-only blocks. + for (BasicBlock &BB : F) + for (const InstOrReal &I : Blocks[&BB].Insts) { + Instruction *II = I.getInst(); + if ((II && II->mayThrow() && Worklist[Idx].Escapes) || + (I.getOcc() && I.getOcc()->KillLoc.Ptr && + AA.alias(Worklist[Idx].Loc, I.getOcc()->KillLoc) != NoAlias) || + (II && getModRefInfo(Idx, *II) & MRI_Ref)) { + DEBUG(dbgs() << "Found kill for " << Worklist[Idx] << ": "); + if (II) + DEBUG(dbgs() << *II << "\n"); + else + DEBUG(I.getOcc()->print(dbgs(), Worklist) << "\n"); + Worklist[Idx].DefBlocks.insert(&BB); + break; + } + } + + // Compute lambdas. + ReverseIDFCalculator RIDF(PDT); + RIDF.setDefiningBlocks(Worklist[Idx].DefBlocks); + SmallVector LambdaBlocks; + RIDF.calculate(LambdaBlocks); + + for (BasicBlock *BB : LambdaBlocks) { + Blocks[BB].Lambdas.emplace_back(NextID++, *BB, Idx, + Worklist[Idx].numSubclasses()); + Worklist[Idx].Lambdas.push_back(&Blocks[BB].Lambdas.back()); + DEBUG(Blocks[BB].Lambdas.back().print(dbgs() << "Inserted ", Worklist) + << "\n"); + } + } + } + + void addRealOcc(RealOcc &&Occ, RedIdx Idx) { + auto Key = + std::make_pair(Occ.Inst->getOpcode(), getStoreOp(*Occ.Inst)->getType()); + auto Inserted = + Worklist[Idx].Subclasses.insert({Key, Worklist[Idx].numSubclasses()}); + if (Inserted.second) + Worklist[Idx].StoreTypes.push_back(Occ.Inst); + Occ.setClass(Idx, Inserted.first->second); + DEBUG(dbgs() << "Added real occ @ " << Occ.Inst->getParent()->getName() + << " " << *Occ.Inst << "\n\tto subclass " + << *Worklist[Idx].StoreTypes[Inserted.first->second] + << "\n\tof " << Worklist[Idx] << "\n"); + + BasicBlock *BB = Occ.Inst->getParent(); + Worklist[Idx].DefBlocks.insert(BB); + Blocks[BB].Insts.emplace_back(std::move(Occ)); + } + + // Collect real occs and track their basic blocks. + void collectOccurrences() { + EscapeTracker Tracker(F, TLI); + DenseMap BelongsToClass; + + for (BasicBlock &BB : F) + for (Instruction &I : BB) + if (auto LocOcc = makeRealOcc(I, NextID)) { + // Found a real occ for this instruction. Figure out which redundancy + // class its store locbelongs to. + RedIdx Idx = classifyLoc(LocOcc->first, BelongsToClass, Tracker); + addRealOcc(std::move(LocOcc->second), Idx); + } else if (AA.getModRefInfo(&I)) + Blocks[&BB].Insts.emplace_back(I); + } + + bool run() { + if (!PDT.getRootNode()) { + DEBUG(dbgs() << "FIXME: ran into the PDT bug. nothing we can do.\n"); + return false; + } + + collectOccurrences(); + insertLambdas(); + renamePass(); + convertPartialReds(); + + // DSE. 
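+    // Delete every store queued by dse() above. Operands are nulled out
+    // first, and any operand that becomes trivially dead as a result (e.g.
+    // a GEP or bitcast feeding only the deleted store) is queued for
+    // deletion as well.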
+ while (!DeadStores.empty()) { + Instruction *Dead = DeadStores.front(); + DeadStores.pop_front(); + for (Use &U : Dead->operands()) { + Instruction *Op = dyn_cast(U); + U.set(nullptr); + if (Op && isInstructionTriviallyDead(Op, &TLI)) + DeadStores.push_front(Op); + } + Dead->eraseFromParent(); + } + + return true; + } +}; + class PDSELegacyPass : public FunctionPass { public: PDSELegacyPass() : FunctionPass(ID) { @@ -82,9 +953,10 @@ if (skipFunction(F)) return false; - return runPDSE(F, getAnalysis().getAAResults(), - getAnalysis().getPostDomTree(), - getAnalysis().getTLI()); + return PDSE(F, getAnalysis().getAAResults(), + getAnalysis().getPostDomTree(), + getAnalysis().getTLI()) + .run(); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -93,7 +965,6 @@ AU.addRequired(); AU.setPreservesCFG(); - AU.addPreserved(); AU.addPreserved(); } @@ -114,15 +985,19 @@ namespace llvm { PreservedAnalyses PDSEPass::run(Function &F, FunctionAnalysisManager &AM) { - if (!runPDSE(F, AM.getResult(F), - AM.getResult(F), - AM.getResult(F))) + if (!PDSE(F, AM.getResult(F), + AM.getResult(F), + AM.getResult(F)) + .run()) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet(); - PA.preserve(); PA.preserve(); return PA; } + +FunctionPass *createPDSEPass() { return new PDSELegacyPass(); } } // end namespace llvm + +// vim: set shiftwidth=2 tabstop=2: Index: test/Transforms/DeadStoreElimination/pdse.ll =================================================================== --- /dev/null +++ test/Transforms/DeadStoreElimination/pdse.ll @@ -0,0 +1,757 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -basicaa -pdse %s | FileCheck %s + +declare void @may_throw() +declare noalias i8* @malloc(i32) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64 , i32 , i1 ) +declare void @llvm.memmove.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1) + +define void @lo_and_chow(i8* %x, i1 %br0, i1 %br1) { +; CHECK-LABEL: @lo_and_chow( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[V:%.*]] = load i8, i8* [[X:%.*]] +; CHECK-NEXT: [[V1:%.*]] = add nuw i8 [[V]], 1 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP0:%.*]] = phi i8 [ [[V1]], [[BB3:%.*]] ], [ [[V1]], [[BB0:%.*]] ] +; CHECK-NEXT: br i1 [[BR0:%.*]], label [[BB2:%.*]], label [[BB3]] +; CHECK: bb2: +; CHECK-NEXT: store i8 [[TMP0]], i8* [[X]] +; CHECK-NEXT: [[T:%.*]] = load i8, i8* [[X]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: br i1 [[BR1:%.*]], label [[BB1]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 [[V1]], i8* [[X]] +; CHECK-NEXT: ret void +; +bb0: + %v = load i8, i8* %x + %v1 = add nuw i8 %v, 1 + store i8 %v1, i8* %x + br label %bb1 +bb1: + br i1 %br0, label %bb2, label %bb3 +bb2: + %t = load i8, i8* %x + br label %bb3 +bb3: + store i8 %v1, i8* %x + br i1 %br1, label %bb1, label %exit +exit: + ret void +} + +define void @lo_and_chow_maythrow(i8* %x, i1 %br0, i1 %br1) { +; CHECK-LABEL: @lo_and_chow_maythrow( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[V:%.*]] = load i8, i8* [[X:%.*]] +; CHECK-NEXT: [[V1:%.*]] = add nuw i8 [[V]], 1 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP0:%.*]] = phi i8 [ [[V1]], [[BB3:%.*]] ], [ [[V1]], [[BB0:%.*]] ] +; CHECK-NEXT: br i1 [[BR0:%.*]], label [[BB2:%.*]], label [[BB3]] +; CHECK: bb2: +; CHECK-NEXT: store i8 [[TMP0]], i8* [[X]] +; CHECK-NEXT: call void @may_throw() +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: br i1 [[BR1:%.*]], label 
[[BB1]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: store i8 [[V1]], i8* [[X]] +; CHECK-NEXT: ret void +; +bb0: + %v = load i8, i8* %x + %v1 = add nuw i8 %v, 1 + store i8 %v1, i8* %x + br label %bb1 +bb1: + br i1 %br0, label %bb2, label %bb3 +bb2: + call void @may_throw() + br label %bb3 +bb3: + store i8 %v1, i8* %x + br i1 %br1, label %bb1, label %exit +exit: + ret void +} + +; demos the self-loop problem in post-dom tree. +; define void @f(i8* %x) { +; a: +; store i8 0, i8* %x +; switch i8 0, label %b [ +; i8 1, label %c +; ] +; b: +; store i8 1, i8* %x +; br label %b +; c: +; store i8 2, i8* %x +; br label %d +; d: +; br label %d +; e: +; store i8 3, i8* %x +; ret void +; } +; +; define void @g(i8* %a, i8* %b) { +; bb0: +; store i8 undef, i8* %b +; store i8 undef, i8* %a +; br i1 undef, label %bb1, label %bb2 +; bb1: +; %tmp0 = load i8, i8* %a +; ret void +; bb2: +; store i8 undef, i8* %a +; ret void +; } + +; define void @i(i8* noalias %x, i8* noalias %y, i1 %z) { +; %whatever = load i8, i8* %x +; br label %nextblock +; +; nextblock: +; store i8 %whatever, i8* %x +; store i8 123, i8* %x +; br i1 %z, label %nextblock, label %fin +; +; fin: +; ret void +; } + +define i8* @j(i8* %a, i8* %e, i1 %c) { +; CHECK-LABEL: @j( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[P:%.*]] = tail call i8* @malloc(i32 4) +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* [[A:%.*]], i8* nonnull [[E:%.*]], i64 64, i32 8, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[A]], i8 undef, i64 32, i32 8, i1 false) +; CHECK-NEXT: [[X:%.*]] = bitcast i8* [[A]] to i64* +; CHECK-NEXT: [[Z:%.*]] = getelementptr i64, i64* [[X]], i64 1 +; CHECK-NEXT: store i64 undef, i64* [[Z]] +; CHECK-NEXT: store i8 undef, i8* [[A]] +; CHECK-NEXT: call void @may_throw() +; CHECK-NEXT: store i8 0, i8* [[P]] +; CHECK-NEXT: store i8 undef, i8* [[A]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret i8* [[P]] +; +bb0: + %b = alloca i8 + %P = tail call i8* @malloc(i32 4) + br i1 %c, label %bb1, label %bb2 +bb1: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b, i8* nonnull %a, i64 64, i32 8, i1 false) + call void @llvm.memmove.p0i8.p0i8.i64(i8* %a, i8* nonnull %e, i64 64, i32 8, i1 false) + call void @llvm.memset.p0i8.i64(i8* %a, i8 undef, i64 32, i32 8, i1 false) + %x = bitcast i8* %a to i64* + %z = getelementptr i64, i64* %x, i64 1 + store i8 undef, i8* %a + store i64 undef, i64* %z + store i8 undef, i8* %a + ; ^ future full elim phase should kill this + store i8 4, i8* %P + call void @may_throw() + store i8 0, i8* %P + store i8 undef, i8* %a + br label %bb3 +bb2: + br label %bb3 +bb3: + ret i8* %P +} + +define void @aliasing_load_kills(i8* %a) { +; CHECK-LABEL: @aliasing_load_kills( +; CHECK-NEXT: bb0: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 undef, label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 undef, i8* [[A:%.*]] +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb3: +; CHECK-NEXT: store i8 undef, i8* [[A]] +; CHECK-NEXT: [[X:%.*]] = load i8, i8* [[A]] +; CHECK-NEXT: store i8 undef, i8* [[A]] +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb4: +; CHECK-NEXT: ret void +; +bb0: + store i8 undef, i8* %a + br label %bb1 +bb1: + store i8 undef, i8* %a + br i1 undef, label %bb2, label %bb3 +bb2: + store i8 undef, i8* %a + br label %bb4 +bb3: + %x = load i8, i8* %a + store i8 undef, i8* %a + br label %bb4 +bb4: + ret void +} + 
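+; The memcpy in bb0 below is only partially dead: it is overwritten along the
+; bb1 path but not along the bb2 path. PDSE makes it fully dead by inserting a
+; copy of it into bb2 and then deletes the original in bb0, as the CHECK lines
+; show.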
+define void @memcpy_example(i8* %a, i8* noalias %b, i1 %br0) { +; CHECK-LABEL: @memcpy_example( +; CHECK-NEXT: bb0: +; CHECK-NEXT: br i1 [[BR0:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[A:%.*]], i8* [[B:%.*]], i64 64, i32 8, i1 false) +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[A]], i8* [[B]], i64 64, i32 8, i1 false) +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; +bb0: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 64, i32 8, i1 false) + br i1 %br0, label %bb1, label %bb2 +bb1: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 64, i32 8, i1 false) + br label %bb3 +bb2: + br label %bb3 +bb3: + ret void +} + +; http://i.imgur.com/abuFdZ2.png +define void @multiple_pre(i8* %a, i8 %b, i1 %c, i1 %d) { +; CHECK-LABEL: @multiple_pre( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[R:%.*]] = add i8 [[B:%.*]], 1 +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[S:%.*]] = add i8 [[R]], 2 +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP0:%.*]] = phi i8 [ [[R]], [[BB2]] ], [ [[S]], [[BB1]] ] +; CHECK-NEXT: br i1 [[D:%.*]], label [[BB4:%.*]], label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: store i8 [[R]], i8* [[A:%.*]] +; CHECK-NEXT: br label [[EX:%.*]] +; CHECK: bb5: +; CHECK-NEXT: store i8 [[TMP0]], i8* [[A]] +; CHECK-NEXT: br label [[EX]] +; CHECK: ex: +; CHECK-NEXT: ret void +; +bb0: + %r = add i8 %b, 1 + br i1 %c, label %bb1, label %bb2 +bb1: + %s = add i8 %r, 2 + store i8 %s, i8* %a + br label %bb3 +bb2: + store i8 %r, i8* %a + br label %bb3 +bb3: + br i1 %d, label %bb4, label %bb5 +bb4: + store i8 %r, i8* %a + br label %ex +bb5: + br label %ex +ex: + ret void +} + +; PRE insertion can't happen at the bb3 lambda because its uses, despite +; belonging to the same redundancy class, are altogether different instruction +; types. PDSE handles this by partitioning each redundancy class into +; like-opcode subclasses, and calculating anticipation separately for each. +; +; The i8* %a class below comprises of memset and store subclasses, and the bb3 +; lambda counts as up-unsafe (and ultimately not willBeAnt) for both because: +; For the store subclass, bb3 is exposed to a memset (which counts as an +; aliasing, non-PRE-insertable occurrence), and vice versa for the memset +; subclass. 
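+; (In the implementation, these subclasses correspond to RedClass::Subclasses
+; entries, keyed on each store's opcode and stored value type; see addRealOcc
+; in PDSE.cpp.)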
+define void @unable_to_elim(i8* %a, i8 %b, i1 %c, i1 %d) { +; CHECK-LABEL: @unable_to_elim( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[R:%.*]] = add i8 [[B:%.*]], 1 +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[S:%.*]] = add i8 [[R]], 2 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[A:%.*]], i8 [[S]], i64 1, i32 1, i1 false) +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 [[R]], i8* [[A]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: br i1 [[D:%.*]], label [[BB4:%.*]], label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: store i8 [[R]], i8* [[A]] +; CHECK-NEXT: br label [[EX:%.*]] +; CHECK: bb5: +; CHECK-NEXT: br label [[EX]] +; CHECK: ex: +; CHECK-NEXT: ret void +; +bb0: + %r = add i8 %b, 1 + br i1 %c, label %bb1, label %bb2 +bb1: + %s = add i8 %r, 2 + call void @llvm.memset.p0i8.i64(i8* %a, i8 %s, i64 1, i32 1, i1 false) + br label %bb3 +bb2: + store i8 %r, i8* %a + br label %bb3 +bb3: + br i1 %d, label %bb4, label %bb5 +bb4: + store i8 %r, i8* %a + br label %ex +bb5: + br label %ex +ex: + ret void +} + +; FIXME: PDSE-ing +; +; a, b, c --- lambda --- +; | +; +------- c, b, a +; +; into: +; +; --- lambda --- a, b, c +; | +; +------- c, b, a +; +; would require multiple rounds of computeWillBeAnt. +define void @pre_blocked(i8* %a, i8* %b, i8* %c, i1 %br0) { +; CHECK-LABEL: @pre_blocked( +; CHECK-NEXT: bb0: +; CHECK-NEXT: store i8 1, i8* [[A:%.*]] +; CHECK-NEXT: store i8 1, i8* [[B:%.*]] +; CHECK-NEXT: br i1 [[BR0:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i8 1, i8* [[C:%.*]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 11, i8* [[C]] +; CHECK-NEXT: store i8 11, i8* [[B]] +; CHECK-NEXT: store i8 11, i8* [[A]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; +bb0: + store i8 1, i8* %a + store i8 1, i8* %b + store i8 1, i8* %c + br i1 %br0, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: + store i8 11, i8* %c + store i8 11, i8* %b + store i8 11, i8* %a + br label %bb3 +bb3: + ret void +} + +; FIXME: Should transform this: +; +; s, s' --- lambda --- +; | +; +------- s +; +; into: +; +; --- lambda --- s, s' +; | +; +------- s', s +define void @pre_blocked_again(i64* %a, i1 %br0) { +; CHECK-LABEL: @pre_blocked_again( +; CHECK-NEXT: bb0: +; CHECK-NEXT: store i64 1, i64* [[A:%.*]] +; CHECK-NEXT: [[X:%.*]] = bitcast i64* [[A]] to i8* +; CHECK-NEXT: [[B:%.*]] = getelementptr i8, i8* [[X]], i64 1 +; CHECK-NEXT: store i8 2, i8* [[B]] +; CHECK-NEXT: br i1 [[BR0:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i64 1, i64* [[A]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; +bb0: + store i64 1, i64* %a + %x = bitcast i64* %a to i8* + %b = getelementptr i8, i8* %x, i64 1 + store i8 2, i8* %b + br i1 %br0, label %bb1, label %bb2 +bb1: + store i64 1, i64* %a + br label %bb3 +bb2: + br label %bb3 +bb3: + ret void +} + +define void @never_escapes() { +; CHECK-LABEL: @never_escapes( +; CHECK-NEXT: bb0: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; +bb0: + %a = alloca i8 + br label %bb1 +bb1: + store i8 12, i8* %a + br label %bb2 +bb2: + ret void +} + +define void @propagate_up_unsafety(i8* %a, i1 %br0, i1 %br1, i1 %br2) { +; CHECK-LABEL: @propagate_up_unsafety( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[BR2:%.*]], label [[BB5:%.*]], 
label [[BB6:%.*]] +; CHECK: bb5: +; CHECK-NEXT: br label [[BB0:%.*]] +; CHECK: bb6: +; CHECK-NEXT: br label [[BB0]] +; CHECK: bb0: +; CHECK-NEXT: br i1 [[BR0:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 [[BR1:%.*]], label [[BB3:%.*]], label [[BB4:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 1, i8* [[A:%.*]] +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: br label [[BB0]] +; CHECK: bb4: +; CHECK-NEXT: store i8 3, i8* [[A]] +; CHECK-NEXT: ret void +; +entry: + br i1 %br2, label %bb5, label %bb6 +bb5: + br label %bb0 +bb6: + store i8 12, i8* %a + br label %bb0 +bb0: + br i1 %br0, label %bb1, label %bb2 +bb1: + br i1 %br1, label %bb3, label %bb4 +bb2: + store i8 1, i8* %a + ret void +bb3: + br label %bb0 +bb4: + store i8 3, i8* %a + ret void +} + +define void @multiple_insertions(i8* %a, i32 %br0) { +; CHECK-LABEL: @multiple_insertions( +; CHECK-NEXT: bb0: +; CHECK-NEXT: switch i32 [[BR0:%.*]], label [[BB1:%.*]] [ +; CHECK-NEXT: i32 1, label [[BB2:%.*]] +; CHECK-NEXT: i32 2, label [[BB3:%.*]] +; CHECK-NEXT: ] +; CHECK: bb1: +; CHECK-NEXT: store i8 12, i8* [[A:%.*]] +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 12, i8* [[A]] +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb3: +; CHECK-NEXT: store i8 12, i8* [[A]] +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb4: +; CHECK-NEXT: ret void +; +bb0: + store i8 12, i8* %a + switch i32 %br0, label %bb1 [ + i32 1, label %bb2 + i32 2, label %bb3 + ] +bb1: + store i8 12, i8* %a + br label %bb4 +bb2: + br label %bb4 +bb3: + br label %bb4 +bb4: + ret void +} + +; The lambda at bb1 can be PRE-ed, but the insertion comes from behind an +; earlier lambda -- the one at bb0. +define void @propagate_uses(i8* %a, i1 %br0, i1 %br1) { +; CHECK-LABEL: @propagate_uses( +; CHECK-NEXT: bb0: +; CHECK-NEXT: br i1 [[BR0:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 [[BR0]], label [[BB3:%.*]], label [[BB1_BB4_CRIT_EDGE:%.*]] +; CHECK: bb1.bb4_crit_edge: +; CHECK-NEXT: store i8 1, i8* [[A:%.*]] +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 1, i8* [[A]] +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb3: +; CHECK-NEXT: store i8 2, i8* [[A]] +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb4: +; CHECK-NEXT: br label [[BB5]] +; CHECK: bb5: +; CHECK-NEXT: ret void +; +bb0: + store i8 1, i8* %a + br i1 %br0, label %bb1, label %bb2 +bb1: + br i1 %br0, label %bb3, label %bb4 +bb2: + br label %bb5 +bb3: + store i8 2, i8* %a + br label %bb4 +bb4: + br label %bb5 +bb5: + ret void +} + +; Check that renaming rules handle overwrites correctly. 
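+; Both `store i8 1` instructions below are overwritten before their bytes are
+; read: the one in bb0 by the covering i64 store, and the one in bb1 by the
+; 8-byte memcpy. The intervening loads only touch byte 2, so they do not keep
+; the narrow stores alive, and both are deleted.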
+define void @small_store_can_dse(i8*, i8* noalias) { +; CHECK-LABEL: @small_store_can_dse( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0:%.*]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64* +; CHECK-NEXT: store i64 3, i64* [[TMP4]] +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP2]] +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1:%.*]], i64 8, i32 1, i1 false) +; CHECK-NEXT: ret void +; +bb0: + store i8 1, i8* %0 + %2 = getelementptr inbounds i8, i8* %0, i64 2 + %3 = load i8, i8* %2 + %4 = bitcast i8* %0 to i64* + store i64 3, i64* %4 + br label %bb1 +bb1: + %5 = load i8, i8* %0 + store i8 1, i8* %0 + %6 = load i8, i8* %2 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 8, i32 1, i1 false) + ret void +} + +; Nothing should be inserted. The i64 store should be used by the lambda in bb0. +define void @no_extra_insert(i8* %a, i1 %br0) { +; CHECK-LABEL: @no_extra_insert( +; CHECK-NEXT: bb0: +; CHECK-NEXT: br i1 [[BR0:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[B:%.*]] = bitcast i8* [[A:%.*]] to i64* +; CHECK-NEXT: store i64 2, i64* [[B]] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 3, i8* [[A]] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +bb0: + store i8 1, i8* %a + br i1 %br0, label %bb1, label %bb2 +bb1: + %b = bitcast i8* %a to i64* + store i64 2, i64* %b + br label %exit +bb2: + store i8 3, i8* %a + br label %exit +exit: + ret void +} + +; No DSE because the memmove doubles as an aliasing load. +define void @kills_own_occ_class(i8* %a, i8* %b) { +; CHECK-LABEL: @kills_own_occ_class( +; CHECK-NEXT: store i8 3, i8* [[A:%.*]] +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* [[A]], i8* nonnull [[B:%.*]], i64 1, i32 8, i1 false) +; CHECK-NEXT: ret void +; + store i8 3, i8* %a + call void @llvm.memmove.p0i8.p0i8.i64(i8* %a, i8* nonnull %b, i64 1, i32 8, i1 false) + ret void +} + +; i1* %tmp and i8* %arg belong to the same redundancy class, but have different +; types. So subclasses need to be grouped by the store value operand type in +; addition to opcode. +define void @subclass_on_store_value_type_too(i8* %arg) { +; CHECK-LABEL: @subclass_on_store_value_type_too( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = bitcast i8* [[ARG:%.*]] to i1* +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i1 undef, i1* [[TMP]] +; CHECK-NEXT: ret void +; +bb: + store i8 -123, i8* %arg + %tmp = bitcast i8* %arg to i1* + br label %bb1 +bb1: + store i1 undef, i1* %tmp + br i1 undef, label %bb1, label %bb2 +bb2: + ret void +} + +define void @dont_include_nonsubclass_uses(i8* %arg) { +; CHECK-LABEL: @dont_include_nonsubclass_uses( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i8 -123, i8* [[ARG:%.*]] +; CHECK-NEXT: ret void +; +bb: + %tmp = bitcast i8* %arg to i1* + br label %bb1 +bb1: + store i1 undef, i1* %tmp + br i1 undef, label %bb1, label %bb2 +bb2: + store i8 -123, i8* %arg + ret void +} + +; The critical edge from the lambda at bb4 to bb6 is an unsplittable, which +; prevents PRE from filling its corresponding null def. 
PDSE therefore +; classifies bb4 as `CanBeAnt == false`. +define void @cant_split_indirectbr_edge() { +; CHECK-LABEL: @cant_split_indirectbr_edge( +; CHECK-NEXT: bb: +; CHECK-NEXT: indirectbr i8* undef, [label [[BB2:%.*]], label %bb6] +; CHECK: bb2: +; CHECK-NEXT: indirectbr i8* undef, [label %bb3] +; CHECK: bb3: +; CHECK-NEXT: indirectbr i8* undef, [label [[BB4:%.*]], label %bb3] +; CHECK: bb4: +; CHECK-NEXT: store float undef, float* undef, align 1 +; CHECK-NEXT: indirectbr i8* undef, [label [[BB2]], label %bb6] +; CHECK: bb6: +; CHECK-NEXT: ret void +; +bb: + indirectbr i8* undef, [label %bb2, label %bb6] +bb2: + indirectbr i8* undef, [label %bb3] +bb3: + store float undef, float* undef, align 1 + indirectbr i8* undef, [label %bb4, label %bb3] +bb4: + indirectbr i8* undef, [label %bb2, label %bb6] +bb6: + ret void +} + +; Same as cant_split_indirectbr_edge, but with a lambda use. +define void @cant_split_indirectbr_edge2() { +; CHECK-LABEL: @cant_split_indirectbr_edge2( +; CHECK-NEXT: bb: +; CHECK-NEXT: indirectbr i8* undef, [label [[BB2:%.*]], label [[BB4:%.*]], label %bb6] +; CHECK: bb2: +; CHECK-NEXT: indirectbr i8* undef, [label %bb3] +; CHECK: bb3: +; CHECK-NEXT: store float undef, float* undef, align 1 +; CHECK-NEXT: indirectbr i8* undef, [label [[BB4]], label %bb3] +; CHECK: bb4: +; CHECK-NEXT: indirectbr i8* undef, [label [[BB2]], label %bb6] +; CHECK: bb6: +; CHECK-NEXT: ret void +; +bb: + indirectbr i8* undef, [label %bb2, label %bb4, label %bb6] +bb2: + indirectbr i8* undef, [label %bb3] +bb3: + store float undef, float* undef, align 1 + indirectbr i8* undef, [label %bb4, label %bb3] +bb4: + indirectbr i8* undef, [label %bb2, label %bb6] +bb6: + ret void +} + +define void @cant_split2(i8* %arg) { +; CHECK-LABEL: @cant_split2( +; CHECK-NEXT: bb: +; CHECK-NEXT: indirectbr i8* undef, [label [[BB1:%.*]], label %bb2] +; CHECK: bb1: +; CHECK-NEXT: store i8* blockaddress(@cant_split2, [[BB2:%.*]]), i8** undef, align 4 +; CHECK-NEXT: indirectbr i8* undef, [label [[BB5:%.*]], label %bb1] +; CHECK: bb2: +; CHECK-NEXT: indirectbr i8* undef, [label [[BB4:%.*]], label %bb3] +; CHECK: bb3: +; CHECK-NEXT: indirectbr i8* undef, [label [[BB1]], label %bb5] +; CHECK: bb4: +; CHECK-NEXT: unreachable +; CHECK: bb5: +; CHECK-NEXT: ret void +; +bb: + indirectbr i8* undef, [label %bb1, label %bb2] +bb1: + store i8* blockaddress(@cant_split2, %bb2), i8** undef, align 4 + indirectbr i8* undef, [label %bb5, label %bb1] +bb2: + indirectbr i8* undef, [label %bb4, label %bb3] +bb3: + indirectbr i8* undef, [label %bb1, label %bb5] +bb4: + unreachable +bb5: + ret void +}
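+
+; Usage note: the RUN line above exercises the legacy pass directly via
+;   opt -S -basicaa -pdse <file>
+; and, within the standard -O2 and LTO pipelines, PDSE is selected in place of
+; DSE by the hidden -enable-pdse flag added to PassManagerBuilder.cpp in this
+; patch.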