diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -334,6 +334,7 @@
 void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry&);
 void initializeOptimizePHIsPass(PassRegistry&);
 void initializePAEvalPass(PassRegistry&);
+void initializePartialMemToRegLegacyPassPass(PassRegistry &);
 void initializePEIPass(PassRegistry&);
 void initializePGOIndirectCallPromotionLegacyPassPass(PassRegistry&);
 void initializePGOInstrumentationGenLegacyPassPass(PassRegistry&);
diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@@ -113,6 +113,13 @@
 //
 FunctionPass *createSROAPass();
 
+//===----------------------------------------------------------------------===//
+//
+// PartialMemToReg - Converts alloca uses into phi nodes until the address
+// is (potentially) captured.
+//
+FunctionPass *createPartialMemToRegPass();
+
 //===----------------------------------------------------------------------===//
 //
 // InductiveRangeCheckElimination - Transform loops to elide range checks on
diff --git a/llvm/include/llvm/Transforms/Scalar/PartialMemToReg.h b/llvm/include/llvm/Transforms/Scalar/PartialMemToReg.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/PartialMemToReg.h
@@ -0,0 +1,53 @@
+//===- PartialMemToReg.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Provides a pass which runs a partial mem2reg operation on allocas which
+/// are deemed to be captured at some point but are used extensively
+/// beforehand.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_PARTIALMEMTOREG_H
+#define LLVM_TRANSFORMS_SCALAR_PARTIALMEMTOREG_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ValueHandle.h"
+
+namespace llvm {
+
+class AssumptionCache;
+class DominatorTree;
+class Function;
+class LoopInfo;
+class PostDominatorTree;
+
+class PartialMemToRegLegacyPass;
+
+class PartialMemToReg : public PassInfoMixin<PartialMemToReg> {
+public:
+  PartialMemToReg() = default;
+
+  /// Run the pass over the function.
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  friend class PartialMemToRegLegacyPass;
+
+  /// Helper used by both the public run method and by the legacy pass.
+  PreservedAnalyses runImpl(Function &F, DominatorTree &DT,
+                            PostDominatorTree &PDT, AssumptionCache &AC,
+                            LoopInfo &LI);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_PARTIALMEMTOREG_H
diff --git a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
--- a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
+++ b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
@@ -18,7 +18,9 @@
 template <typename T> class ArrayRef;
 class AllocaInst;
+class LoopInfo;
 class DominatorTree;
+class PostDominatorTree;
 class AssumptionCache;
 
 /// Return true if this alloca is legal for promotion.
@@ -27,7 +29,7 @@
 /// (transitively) using this alloca. This also enforces that there is only
 /// ever one layer of bitcasts or GEPs between the alloca and the lifetime
 /// markers.
-bool isAllocaPromotable(const AllocaInst *AI);
+bool isAllocaPromotable(const AllocaInst *AI, bool AllowCaptures = false);
 
 /// Promote the specified list of alloca instructions into scalar
 /// registers, inserting PHI nodes as appropriate.
@@ -39,6 +41,9 @@
 void PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
                      AssumptionCache *AC = nullptr);
 
+bool partialPromoteMemToReg(ArrayRef<AllocaInst *> Allocas, LoopInfo &LI,
+                            DominatorTree &DT, PostDominatorTree &PDT,
+                            AssumptionCache *AC = nullptr);
+
 } // End llvm namespace
 
 #endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -193,6 +193,7 @@
 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
 #include "llvm/Transforms/Scalar/NaryReassociate.h"
 #include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/Transforms/Scalar/PartialMemToReg.h"
 #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
 #include "llvm/Transforms/Scalar/Reassociate.h"
 #include "llvm/Transforms/Scalar/Reg2Mem.h"
@@ -279,6 +280,10 @@
     "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden,
     cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3"));
 
+static cl::opt<bool> EnablePartialMemToReg(
+    "enable-partial-mem2reg", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+    cl::desc("Enable partial mem2reg SSA transformation before captures."));
+
 PipelineTuningOptions::PipelineTuningOptions() {
   LoopInterleaving = true;
   LoopVectorization = true;
@@ -816,6 +821,10 @@
   // Delete small array after loop unroll.
   FPM.addPass(SROA());
 
+  // Partially promote some captured allocas to SSA form.
+  if (EnablePartialMemToReg)
+    FPM.addPass(PartialMemToReg());
+
   // Eliminate redundancies.
   FPM.addPass(MergedLoadStoreMotionPass());
   if (RunNewGVN)
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -279,6 +279,7 @@
 FUNCTION_PASS("objc-arc", ObjCARCOptPass())
 FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass())
 FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass())
+FUNCTION_PASS("partial-mem2reg", PartialMemToReg())
 FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
 FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
 FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs()))
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -60,6 +60,7 @@
   NaryReassociate.cpp
   NewGVN.cpp
   PartiallyInlineLibCalls.cpp
+  PartialMemToReg.cpp
   PlaceSafepoints.cpp
   Reassociate.cpp
   Reg2Mem.cpp
diff --git a/llvm/lib/Transforms/Scalar/PartialMemToReg.cpp b/llvm/lib/Transforms/Scalar/PartialMemToReg.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/PartialMemToReg.cpp
@@ -0,0 +1,130 @@
+//===- PartialMemToReg.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Provides a pass which runs a partial mem2reg operation on allocas which
+/// are deemed to be captured at some point but are used extensively
+/// beforehand.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/PartialMemToReg.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partial-mem2reg"
+
+namespace llvm {
+
+PreservedAnalyses PartialMemToReg::run(Function &F,
+                                       FunctionAnalysisManager &AM) {
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+
+  return runImpl(F, DT, PDT, AC, LI);
+}
+
+PreservedAnalyses PartialMemToReg::runImpl(Function &F, DominatorTree &DT,
+                                           PostDominatorTree &PDT,
+                                           AssumptionCache &AC, LoopInfo &LI) {
+  LLVM_DEBUG(dbgs() << "PartialMem2Reg on: " << F.getName() << "\n");
+
+  // Collect candidate allocas from the entry block.
+  SmallVector<AllocaInst *> Worklist;
+  BasicBlock &EntryBB = F.getEntryBlock();
+  for (Instruction &I : EntryBB)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+      if (!isa(AI->getAllocatedType()))
+        Worklist.push_back(AI);
+
+  if (!partialPromoteMemToReg(Worklist, LI, DT, PDT, &AC))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  PA.preserve<GlobalsAA>();
+  return PA;
+}
+
+class PartialMemToRegLegacyPass : public FunctionPass {
+  PartialMemToReg Impl;
+
+public:
+  static char ID;
+
+  PartialMemToRegLegacyPass() : FunctionPass(ID) {
+    initializePartialMemToRegLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto PA = Impl.runImpl(
+        F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+        getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(),
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+        getAnalysis<LoopInfoWrapperPass>().getLoopInfo());
+
+    return !PA.areAllPreserved();
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+  StringRef getPassName() const override { return "PartialMemToReg"; }
+};
+} // end namespace llvm
+
+char llvm::PartialMemToRegLegacyPass::ID = 0;
+
+FunctionPass *llvm::createPartialMemToRegPass() {
+  return new PartialMemToRegLegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(PartialMemToRegLegacyPass, "partial-mem2reg",
+                      "PartialMemToReg", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(PartialMemToRegLegacyPass, "partial-mem2reg",
+                    "PartialMemToReg", false, false)
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -92,6 +92,7 @@
   initializeMergedLoadStoreMotionLegacyPassPass(Registry);
   initializeNaryReassociateLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
+  initializePartialMemToRegLegacyPassPass(Registry);
   initializeReassociateLegacyPassPass(Registry);
   initializeRedundantDbgInstEliminationPass(Registry);
   initializeRegToMemLegacyPass(Registry);
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -25,7 +25,8 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -45,6 +46,7 @@
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
 #include <cassert>
@@ -61,7 +63,7 @@
 STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
 STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
 
-bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+bool llvm::isAllocaPromotable(const AllocaInst *AI, bool AllowCaptures) {
   // Only allow direct and non-volatile loads and stores...
   for (const User *U : AI->users()) {
     if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
@@ -70,9 +72,12 @@
       if (LI->isVolatile())
        return false;
     } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
-      if (SI->getValueOperand() == AI ||
-          SI->getValueOperand()->getType() != AI->getAllocatedType())
-        return false; // Don't allow a store OF the AI, only INTO the AI.
+      // Don't allow a store OF the AI, only INTO the AI, unless we're
+      // looking for captures during partial mem2reg.
+      if ((SI->getValueOperand() == AI ||
+           SI->getValueOperand()->getType() != AI->getAllocatedType()) &&
+          !AllowCaptures)
+        return false;
       // Note that atomic stores can be transformed; atomic semantics do
       // not have any meaning for a local alloca.
       if (SI->isVolatile())
@@ -124,7 +129,9 @@
 
   /// Scan the uses of the specified alloca, filling in the AllocaInfo used
   /// by the rest of the pass to reason about the uses of this alloca.
-  void AnalyzeAlloca(AllocaInst *AI) {
+  void AnalyzeAlloca(AllocaInst *AI, bool Capturing = false,
+                     PostDominatorTree *PDT = nullptr,
+                     StoreInst *Capture = nullptr) {
     clear();
 
     // As we scan the uses of the alloca instruction, keep track of stores,
@@ -133,16 +140,28 @@
     for (User *U : AI->users()) {
       Instruction *User = cast<Instruction>(U);
 
+      // If we're analyzing a captured alloca, only consider the users that
+      // are postdominated by the capture. Anything after the capture or
+      // in a block which may bypass the capture should not be converted
+      // to SSA form.
+      if (Capturing && !PDT->dominates(Capture, User))
+        continue;
+
       if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
         // Remember the basic blocks which define new values for the alloca
-        DefiningBlocks.push_back(SI->getParent());
-        OnlyStore = SI;
-      } else {
-        LoadInst *LI = cast<LoadInst>(User);
+        if (SI->getOperand(0) == AI) {
+          assert(Capturing && "Unexpected capture for non-captured alloca.");
+          UsingBlocks.push_back(SI->getParent());
+        } else {
+          DefiningBlocks.push_back(SI->getParent());
+          OnlyStore = SI;
+        }
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
         // Otherwise it must be a load instruction, keep track of variable
         // reads.
         UsingBlocks.push_back(LI->getParent());
-      }
+      } else
+        assert(Capturing && "Unexpected user for non-captured alloca.");
 
       if (OnlyUsedInOneBlock) {
         if (!OnlyBlock)
@@ -243,6 +262,10 @@
   /// (BBNumbers), the DenseMap is more efficient (also supports removal).
   DenseMap<std::pair<unsigned, unsigned>, PHINode *> NewPhiNodes;
 
+  /// Stores inserted to flush the promoted value back to memory just before
+  /// the capturing store of each partially promoted alloca.
+  DenseMap<unsigned, StoreInst *> NewPreCaptureStores;
+
+  /// The single capturing store identified for each partially promoted
+  /// alloca, indexed by its position in Allocas.
+  DenseMap<unsigned, StoreInst *> Captures;
+
   /// For each PHI node, keep track of which entry in Allocas it corresponds
   /// to.
   DenseMap<PHINode *, unsigned> PhiToAllocaMap;
@@ -271,6 +294,7 @@
             nullptr, &DT, AC) {}
 
   void run();
+  bool runPartial(PostDominatorTree &PDT, LoopInfo &LI);
 
 private:
   void RemoveFromAllocasList(unsigned &AllocaIdx) {
@@ -294,6 +318,7 @@
                   RenamePassData::LocationVector &IncLocs,
                   std::vector<RenamePassData> &Worklist);
   bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
+  bool QueuePreCaptureStore(BasicBlock *BB, unsigned AllocaIdx);
 };
 
 } // end anonymous namespace
 
@@ -543,6 +568,311 @@
   return true;
 }
 
+/// Partially promote the allocas in Allocas: rewrite the loads and stores
+/// that are postdominated by the single capturing store into SSA form, and
+/// materialise the final value with a store immediately before the capture.
+bool PromoteMem2Reg::runPartial(PostDominatorTree &PDT, LoopInfo &LI) {
+  Function &F = *DT.getRoot()->getParent();
+  AllocaDbgUsers.resize(Allocas.size());
+  AllocaInfo Info;
+  LargeBlockInfo LBI;
+  ForwardIDFCalculator IDF(DT);
+
+  // Create a stable numbering for basic blocks to avoid any non-deterministic
+  // behaviour with ordering.
+  if (BBNumbers.empty()) {
+    unsigned ID = 0;
+    for (auto &BB : F)
+      BBNumbers[&BB] = ID++;
+  }
+
+  for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
+    AllocaInst *AI = Allocas[AllocaNum];
+    StoreInst *Capture = nullptr;
+    LLVM_DEBUG(dbgs() << "PM2R: Analyzing alloca: " << *AI << "\n");
+
+    if (!isAllocaPromotable(AI, /*AllowCaptures=*/true)) {
+      LLVM_DEBUG(dbgs() << "PM2R: Unhandled uses.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // For now, be a bit paranoid and only consider allocas with a single
+    // capture user, and only stores as a capture.
+    // TODO: Allow for multiple captures and captures in call instructions.
+    bool MultiCapture = false;
+    for (User *U : AI->users()) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(U))
+        if (SI->getValueOperand() == AI) {
+          if (!Capture)
+            Capture = SI;
+          else
+            MultiCapture = true;
+        }
+    }
+
+    if (!Capture || MultiCapture) {
+      LLVM_DEBUG(dbgs() << "PM2R: No capture or multiple captures.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    Captures[AllocaNum] = Capture;
+
+    // Find the set of users that are postdominated by the capture.
+    SmallVector<Instruction *> PreCaptureUsers;
+    for (User *U : AI->users()) {
+      Instruction *I = cast<Instruction>(U);
+      // Ignore the capture itself.
+      if (Capture == I)
+        continue;
+
+      // We only consider users that are postdominated by the capture -- that
+      // is, we _know_ that the capture will definitely be executed after the
+      // user. For users that are executed after the capture, or users where
+      // the subsequent execution path might not go through the block
+      // containing the capture, we don't want to convert right now.
+      if (PDT.dominates(Capture, I))
+        PreCaptureUsers.push_back(I);
+    }
+
+    // If there are no users postdominated by the capture, we won't try this
+    // since anything after the capture could be reached without storing
+    // the value into the alloca location.
+    // TODO: We should be able to identify all blocks which need to store
+    // the value into memory before entering code that may follow a capture.
+    if (PreCaptureUsers.empty()) {
+      LLVM_DEBUG(dbgs() << "PM2R: No users postdominated by capture.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // For now, only perform the partial conversion if some of the uses are
+    // present in a loop -- while it may be worthwhile to do this anyway,
+    // we're currently interested in enabling loop transformations that would
+    // otherwise be prevented by the presence of loads/stores to the alloca
+    // within the loop.
+    bool UsedInLoop = false;
+    for (Loop *L : LI)
+      UsedInLoop |= any_of(PreCaptureUsers,
+                           [&L](Instruction *I) { return L->contains(I); });
+    if (!UsedInLoop) {
+      LLVM_DEBUG(dbgs() << "PM2R: No users in loops.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // Determine which blocks define, use, and/or capture the alloca.
+    Info.AnalyzeAlloca(AI, /*Capturing=*/true, &PDT, Capture);
+
+    // Unique the set of defining blocks for efficient lookup.
+    SmallPtrSet<BasicBlock *, 32> DefBlocks(Info.DefiningBlocks.begin(),
+                                            Info.DefiningBlocks.end());
+
+    // Determine which blocks the value is live in. These are blocks which
+    // lead to uses.
+    SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+    ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
+
+    if (!all_of(LiveInBlocks, [&PDT, &Capture](BasicBlock *BB) {
+          return PDT.dominates(Capture->getParent(), BB);
+        })) {
+      LLVM_DEBUG(
+          dbgs()
+          << "PM2R: not all live blocks are postdominated by capture.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // Ok, we've passed all our criteria for partially promoting an alloca.
+    // Proceed with figuring out what to do with it but throw in a few extra
+    // checks out of an abundance of caution.
+
+    // Calculate dominance frontiers so we know where to plant phi nodes for
+    // SSA conversion.
+    IDF.setLiveInBlocks(LiveInBlocks);
+    IDF.setDefiningBlocks(DefBlocks);
+    SmallVector<BasicBlock *, 32> PHIBlocks;
+    IDF.calculate(PHIBlocks);
+    llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) {
+      return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
+    });
+
+    if (PHIBlocks.empty()) {
+      LLVM_DEBUG(dbgs() << "PM2R: could not identify a usable phi block\n");
+      continue;
+    }
+
+    // Only proceed if all phi blocks are postdominated by the capture.
+    // Maybe this should be an assert?
+    if (!all_of(PHIBlocks, [&PDT, &Capture](BasicBlock *BB) {
+          return PDT.dominates(Capture->getParent(), BB);
+        })) {
+      LLVM_DEBUG(
+          dbgs() << "PM2R: not all phi blocks are postdominated by capture.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // We may want to find a better way of doing this in future, but for
+    // now just be paranoid and sort the blocks by postdomination order
+    // before adding the store to ensure the correct value is in place
+    // for the capture.
+    // TODO: Add support for phis on diverging paths that are still
+    // postdominated by the capture.
+    SmallVector<BasicBlock *, 32> PDOrderBlocks(PHIBlocks.begin(),
+                                                PHIBlocks.end());
+    llvm::sort(PDOrderBlocks, [&PDT](BasicBlock *A, BasicBlock *B) {
+      return PDT.dominates(B, A);
+    });
+    BasicBlock *DefBlock = PDOrderBlocks.back();
+
+    // Maybe this should be an assert?
+    if (!all_of(PDOrderBlocks, [&PDT, &DefBlock](BasicBlock *BB) {
+          return PDT.dominates(DefBlock, BB);
+        })) {
+      LLVM_DEBUG(dbgs() << "PM2R: definition block for capture doesn't "
+                           "postdominate all other phi blocks.\n");
+      RemoveFromAllocasList(AllocaNum);
+      continue;
+    }
+
+    // Point of no return; we're making changes to the IR now.
+
+    // Remember the dbg.declare intrinsic describing this alloca, if any.
+    if (!Info.DbgUsers.empty())
+      AllocaDbgUsers[AllocaNum] = Info.DbgUsers;
+
+    LLVM_DEBUG(dbgs() << "PM2R: Partially promoting alloca: " << *AI << "\n");
+
+    // Keep the reverse mapping of the 'Allocas' array for the rename pass.
+    AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
+
+    unsigned CurrentVersion = 0;
+    for (BasicBlock *BB : PHIBlocks)
+      QueuePhiNode(BB, AllocaNum, CurrentVersion);
+
+    // Create the store that will set the allocated memory to the right value
+    // before the capture occurs.
+    QueuePreCaptureStore(DefBlock, AllocaNum);
+  }
+
+  if (Allocas.empty())
+    return false;
+
+  RenamePassData::ValVector Values(Allocas.size());
+  for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+    Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+
+  // When handling debug info, treat all incoming values as if they have
+  // unknown locations until proven otherwise.
+  RenamePassData::LocationVector Locations(Allocas.size());
+
+  // Walks all basic blocks in the function performing the SSA rename
+  // algorithm and inserting the phi nodes we marked as necessary.
+  std::vector<RenamePassData> RenamePassWorkList;
+  RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values),
+                                  std::move(Locations));
+  do {
+    RenamePassData RPD = std::move(RenamePassWorkList.back());
+    RenamePassWorkList.pop_back();
+    // RenamePass may add new worklist entries.
+    RenamePass(RPD.BB, RPD.Pred, RPD.Values, RPD.Locations,
+               RenamePassWorkList);
+  } while (!RenamePassWorkList.empty());
+
+  // The renamer uses the Visited set to avoid infinite loops. Clear it now.
+  Visited.clear();
+
+  // Loop over all of the PHI nodes and see if there are any that we can get
+  // rid of because they merge all of the same incoming values. This can
+  // happen due to undef values coming into the PHI nodes. This process is
+  // iterative, because eliminating one PHI node can cause others to be
+  // removed.
+  bool EliminatedAPHI = true;
+  while (EliminatedAPHI) {
+    EliminatedAPHI = false;
+
+    // Iterating over NewPhiNodes is deterministic, so it is safe to try to
+    // simplify and RAUW them as we go. If it was not, we could add uses to
+    // the values we replace with in a non-deterministic order, thus creating
+    // non-deterministic def->use chains.
+    for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+             I = NewPhiNodes.begin(),
+             E = NewPhiNodes.end();
+         I != E;) {
+      PHINode *PN = I->second;
+
+      // If this PHI node merges one value and/or undefs, get the value.
+      if (Value *V = SimplifyInstruction(PN, SQ)) {
+        PN->replaceAllUsesWith(V);
+        PN->eraseFromParent();
+        NewPhiNodes.erase(I++);
+        EliminatedAPHI = true;
+        continue;
+      }
+      ++I;
+    }
+  }
+
+  // At this point, the renamer has added entries to PHI nodes for all
+  // reachable code. Unfortunately, there may be unreachable blocks which the
+  // renamer hasn't traversed. If this is the case, the PHI nodes may not
+  // have incoming values for all predecessors. Loop over all PHI nodes we
+  // have created, inserting undef values if they are missing any incoming
+  // values.
+  for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+           I = NewPhiNodes.begin(),
+           E = NewPhiNodes.end();
+       I != E; ++I) {
+    // We want to do this once per basic block. As such, only process a block
+    // when we find the PHI that is the first entry in the block.
+    PHINode *SomePHI = I->second;
+    BasicBlock *BB = SomePHI->getParent();
+    if (&BB->front() != SomePHI)
+      continue;
+
+    // Only do work here if the PHI nodes are missing incoming values. We
+    // know that all PHI nodes that were inserted in a block will have the
+    // same number of incoming values, so we can just check any of them.
+    if (SomePHI->getNumIncomingValues() == getNumPreds(BB))
+      continue;
+
+    // Get the preds for BB.
+    SmallVector<BasicBlock *, 16> Preds(predecessors(BB));
+
+    // Ok, now we know that all of the PHI nodes are missing entries for some
+    // basic blocks. Start by sorting the incoming predecessors for efficient
+    // access.
+    auto CompareBBNumbers = [this](BasicBlock *A, BasicBlock *B) {
+      return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
+    };
+    llvm::sort(Preds, CompareBBNumbers);
+
+    // Now we loop through all BB's which have entries in SomePHI and remove
+    // them from the Preds list.
+    for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
+      // Do a log(n) search of the Preds list for the entry we want.
+      SmallVectorImpl<BasicBlock *>::iterator EntIt = llvm::lower_bound(
+          Preds, SomePHI->getIncomingBlock(i), CompareBBNumbers);
+      assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) &&
+             "PHI node has entry for a block which is not a predecessor!");
+
+      // Remove the entry
+      Preds.erase(EntIt);
+    }
+
+    // At this point, the blocks left in the preds list must have dummy
+    // entries inserted into every PHI node in the block. Update all the phi
+    // nodes in this block that we are inserting (there could be phis before
+    // mem2reg runs).
+    unsigned NumBadPreds = SomePHI->getNumIncomingValues();
+    BasicBlock::iterator BBI = BB->begin();
+    while ((SomePHI = dyn_cast<PHINode>(BBI++)) &&
+           SomePHI->getNumIncomingValues() == NumBadPreds) {
+      Value *UndefVal = UndefValue::get(SomePHI->getType());
+      for (BasicBlock *Pred : Preds)
+        SomePHI->addIncoming(UndefVal, Pred);
+    }
+  }
+
+  NewPhiNodes.clear();
+
+  return true;
+}
+
 void PromoteMem2Reg::run() {
   Function &F = *DT.getRoot()->getParent();
 
@@ -870,6 +1200,23 @@
   return true;
 }
 
+/// Adds a store after the last phi node before a capturing store, so that
+/// the value is up-to-date before the capture of the alloca.
+bool PromoteMem2Reg::QueuePreCaptureStore(BasicBlock *BB, unsigned AllocaIdx) {
+  StoreInst *&SI = NewPreCaptureStores[AllocaIdx];
+  PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaIdx)];
+  AllocaInst *AI = Allocas[AllocaIdx];
+
+  if (SI)
+    return false;
+
+  assert(PN && "No Phi node available for capture!");
+
+  Instruction *InsertBefore = BB->getFirstNonPHI();
+  SI = new StoreInst(PN, AI, InsertBefore);
+  return true;
+}
+
 /// Update the debug location of a phi. \p ApplyMergedLoc indicates whether to
 /// create a merged location incorporating \p DL, or to set \p DL directly.
 static void updateForIncomingValueLocation(PHINode *PN, DebugLoc DL,
@@ -954,7 +1301,14 @@
       if (AI == AllocaLookup.end())
         continue;
 
-      Value *V = IncomingVals[AI->second];
+      unsigned AllocaNo = AI->second;
+
+      // If this is a load after the capture (but potentially in the same
+      // block) then we must not convert it.
+      if (Captures[AllocaNo] && DT.dominates(Captures[AllocaNo], LI))
+        continue;
+
+      Value *V = IncomingVals[AllocaNo];
 
       // If the load was marked as nonnull we don't want to lose
       // that information when we erase this Load. So we preserve
@@ -973,17 +1327,23 @@
       if (!Dest)
         continue;
 
-      DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
-      if (ai == AllocaLookup.end())
+      DenseMap<AllocaInst *, unsigned>::iterator AI = AllocaLookup.find(Dest);
+      if (AI == AllocaLookup.end())
+        continue;
+
+      unsigned AllocaNo = AI->second;
+      StoreInst *&PCS = NewPreCaptureStores[AllocaNo];
+
+      // If this is for a capture, then we don't want to remove it.
+      if (PCS && SI == PCS)
         continue;
 
       // what value were we writing?
-      unsigned AllocaNo = ai->second;
       IncomingVals[AllocaNo] = SI->getOperand(0);
 
       // Record debuginfo for the store before removing it.
       IncomingLocs[AllocaNo] = SI->getDebugLoc();
-      for (DbgVariableIntrinsic *DII : AllocaDbgUsers[ai->second])
+      for (DbgVariableIntrinsic *DII : AllocaDbgUsers[AllocaNo])
         if (DII->isAddressOfVariable())
           ConvertDebugDeclareToDebugValue(DII, SI, DIB);
       BB->getInstList().erase(SI);
@@ -1019,3 +1379,9 @@
 
   PromoteMem2Reg(Allocas, DT, AC).run();
 }
+
+bool llvm::partialPromoteMemToReg(ArrayRef<AllocaInst *> Allocas, LoopInfo &LI,
+                                  DominatorTree &DT, PostDominatorTree &PDT,
+                                  AssumptionCache *AC) {
+  return PromoteMem2Reg(Allocas, DT, AC).runPartial(PDT, LI);
+}
diff --git a/llvm/test/Transforms/Mem2Reg/partial-mem2reg.ll b/llvm/test/Transforms/Mem2Reg/partial-mem2reg.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Mem2Reg/partial-mem2reg.ll
@@ -0,0 +1,348 @@
+; RUN: opt -partial-mem2reg -debug-only=partial-mem2reg,mem2reg -S < %s 2>&1 | FileCheck %s --check-prefix=DEBUG
+; RUN: opt -partial-mem2reg -S < %s 2>&1 | FileCheck %s --check-prefix=XFORM
+; RUN: opt -partial-mem2reg -gvn -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=VDEBUG
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+;; Tests based on the following C code, simplified. The OpenMP codegen from
+;; clang created an outlined function where the alloca for the reduction
+;; was captured by a call to the runtime and couldn't be promoted to register
+;; SSA form. This prevented vectorization.
+;;
+;; We can now vectorize this by partially promoting the alloca, just converting
+;; definitions and uses that are postdominated by the capture. This is a
+;; conservative first attempt at this optimization.
+;;
+;; int loop(int data[restrict 128U])
+;; {
+;;   int retval = 0;
+;;
+;;   #pragma omp parallel for simd schedule(simd:static) default(none) shared(data) reduction(+:retval)
+;;   for (int i = 0; i < 128; i++) {
+;;     int n = 0;
+;;
+;;     if (data[i]) {
+;;       n = 1;
+;;       retval += n;
+;;     }
+;;   }
+;;   return retval;
+;; }
+
+; DEBUG-LABEL: PartialMem2Reg on: captured_reduction
+; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4
+; DEBUG-NEXT: PM2R: Partially promoting alloca: %retval = alloca i32, align 4
+; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list = alloca i32*, align 8
+; DEBUG-NEXT: PM2R: Unhandled uses.
+
+; VDEBUG-LABEL: LV: Checking a loop in "captured_reduction" from
+; VDEBUG: LV: We can vectorize this loop!
+ +; XFORM-LABEL: @captured_reduction +define i32 @captured_reduction(i32* nocapture nonnull readonly %data, i32 %n) { +entry: + %retval = alloca i32, align 4 + %red_list = alloca i32*, align 8 + store i32 0, i32* %retval, align 4 + %limit = zext i32 %n to i64 + br label %loop.ph + +loop.ph: + %iter.check = icmp ugt i64 %limit, 0 + br i1 %iter.check, label %loop.body, label %loop.exit + +; XFORM: loop.body: +; XFORM-NEXT: %retval.0 = phi i32 [ 0, %loop.ph ], [ %retval.1, %if.end ] +loop.body: + %indvars.iv = phi i64 [ 0, %loop.ph ], [ %indvars.iv.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %pred = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %pred, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, 1 + store i32 %rdx.inc, i32* %retval, align 4 + br label %if.end + +; XFORM: if.end: +; XFORM-NEXT: %retval.1 = phi i32 [ %retval.0, %loop.body ], [ %rdx.inc, %if.then ] +if.end: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %limit + br i1 %exitcond, label %loop.body, label %loop.exit + +; XFORM: loop.exit: +; XFORM-NEXT: %retval.2 = phi i32 [ %retval.1, %if.end ], [ 0, %loop.ph ] +; XFORM-NEXT: store i32 %retval.2, i32* %retval, align 4 +loop.exit: + br label %capture + +capture: + store i32* %retval, i32** %red_list, align 8 + %0 = call i32 @capturin_ur_allocas(i32** nonnull %red_list) + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +; DEBUG-LABEL: PartialMem2Reg on: too_many_captures +; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4 +; DEBUG-NEXT: PM2R: No capture or multiple captures. +; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list_2 = alloca i32*, align 8 +; DEBUG-NEXT: PM2R: Unhandled uses. +; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list = alloca i32*, align 8 +; DEBUG-NEXT: PM2R: Unhandled uses. + +; VDEBUG-LABEL: LV: Checking a loop in "too_many_captures" from +; VDEBUG: LV: Can't vectorize the instructions or CFG +; VDEBUG: LV: Not vectorizing: Cannot prove legality. 
+ +define i32 @too_many_captures(i32* nocapture nonnull readonly %data, i32 %n) { +entry: + %retval = alloca i32, align 4 + %red_list = alloca i32*, align 8 + %red_list_2 = alloca i32*, align 8 + store i32 0, i32* %retval, align 4 + %limit = zext i32 %n to i64 + br label %loop.ph + +loop.ph: + %iter.check = icmp ugt i64 %limit, 0 + br i1 %iter.check, label %loop.body, label %loop.exit + +loop.body: + %indvars.iv = phi i64 [ 0, %loop.ph ], [ %indvars.iv.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %pred = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %pred, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, 1 + store i32 %rdx.inc, i32* %retval, align 4 + br label %if.end + +if.end: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %limit + br i1 %exitcond, label %loop.body, label %loop.exit + +loop.exit: + br label %capture + +capture: + store i32* %retval, i32** %red_list, align 8 + %0 = call i32 @capturin_ur_allocas(i32** nonnull %red_list) + store i32* %retval, i32** %red_list_2, align 8 + %1 = call i32 @capturin_ur_allocas(i32** nonnull %red_list_2) + %2 = load i32, i32* %retval, align 4 + ret i32 %2 +} + +; DEBUG-LABEL: PartialMem2Reg on: no_captures +; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4 +; DEBUG-NEXT: PM2R: No capture or multiple captures. + +; VDEBUG-LABEL: LV: Checking a loop in "no_captures" from +; VDEBUG: LV: Can't vectorize the instructions or CFG +; VDEBUG: LV: Not vectorizing: Cannot prove legality. + +define i32 @no_captures(i32* nocapture nonnull readonly %data, i32 %n) { +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %limit = zext i32 %n to i64 + br label %loop.ph + +loop.ph: + %iter.check = icmp ugt i64 %limit, 0 + br i1 %iter.check, label %loop.body, label %loop.exit + +loop.body: + %indvars.iv = phi i64 [ 0, %loop.ph ], [ %indvars.iv.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %pred = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %pred, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, 1 + store i32 %rdx.inc, i32* %retval, align 4 + br label %if.end + +if.end: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %limit + br i1 %exitcond, label %loop.body, label %loop.exit + +loop.exit: + br label %nocapture + +nocapture: + %0 = load i32, i32* %retval, align 4 + ret i32 %0 +} + +; DEBUG-LABEL: PartialMem2Reg on: no_postdominated_users +; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4 +; DEBUG-NEXT: PM2R: No users postdominated by capture. +; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list = alloca i32*, align 8 +; DEBUG-NEXT: PM2R: Unhandled uses. + +; VDEBUG-LABEL: LV: Checking a loop in "no_postdominated_users" from +; VDEBUG: LV: We can vectorize this loop! 
+ +define i32 @no_postdominated_users(i32* nocapture nonnull readonly %data, i32 %n) { +entry: + %retval = alloca i32, align 4 + %red_list = alloca i32*, align 8 + %limit = zext i32 %n to i64 + br label %loop.ph + +loop.ph: + %iter.check = icmp ugt i64 %limit, 0 + br i1 %iter.check, label %loop.body, label %loop.exit + +loop.body: + %indvars.iv = phi i64 [ 0, %loop.ph ], [ %indvars.iv.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %pred = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %pred, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + br label %if.end + +if.end: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %limit + br i1 %exitcond, label %loop.body, label %loop.exit + +loop.exit: + br label %capture + +capture: + store i32* %retval, i32** %red_list, align 8 + %0 = call i32 @capturin_ur_allocas(i32** nonnull %red_list) + store i32 0, i32* %retval, align 4 + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +; DEBUG-LABEL: PartialMem2Reg on: no_loops +; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4 +; DEBUG-NEXT: PM2R: No users in loops. +; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list = alloca i32*, align 8 +; DEBUG-NEXT: PM2R: Unhandled uses. + +define i32 @no_loops(i32* nocapture nonnull readonly %data, i32 %n) { +entry: + %retval = alloca i32, align 4 + %red_list = alloca i32*, align 8 + store i32 0, i32* %retval, align 4 + %limit = zext i32 %n to i64 + br label %capture + +capture: + store i32* %retval, i32** %red_list, align 8 + %0 = call i32 @capturin_ur_allocas(i32** nonnull %red_list) + store i32 0, i32* %retval, align 4 + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +; DEBUG-LABEL: PartialMem2Reg on: live_block_not_postdominated +; DEBUG-NEXT: PM2R: Analyzing alloca: %retval = alloca i32, align 4 +; DEBUG-NEXT: PM2R: not all live blocks are postdominated by capture. +; DEBUG-NEXT: PM2R: Analyzing alloca: %red_list = alloca i32*, align 8 +; DEBUG-NEXT: PM2R: Unhandled uses. + +; VDEBUG-LABEL: LV: Checking a loop in "live_block_not_postdominated" from +; VDEBUG: LV: Can't vectorize the instructions or CFG +; VDEBUG: LV: Not vectorizing: Cannot prove legality. +; VDEBUG: LV: Checking a loop in "live_block_not_postdominated" from +; VDEBUG: LV: Can't vectorize the instructions or CFG +; VDEBUG: LV: Not vectorizing: Cannot prove legality. 
+ +define i32 @live_block_not_postdominated(i32* nocapture nonnull readonly %data, i32 %n, i32 %cond1, i32 %cond2) { +entry: + %retval = alloca i32, align 4 + %red_list = alloca i32*, align 8 + store i32 0, i32* %retval, align 4 + %limit = zext i32 %n to i64 + br label %cond1.check + +cond1.check: + %cmp1 = icmp ugt i32 %cond1, 77 + br i1 %cmp1, label %loop.ph, label %loop2.ph + +loop.ph: + %iter.check = icmp ugt i64 %limit, 0 + br i1 %iter.check, label %loop.body, label %loop2.exit + +loop.body: + %indvars.iv = phi i64 [ 0, %loop.ph ], [ %indvars.iv.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data, i64 %indvars.iv + %pred = load i32, i32* %arrayidx, align 4 + %tobool.not = icmp eq i32 %pred, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %rdx = load i32, i32* %retval, align 4 + %rdx.inc = add nsw i32 %rdx, 1 + store i32 %rdx.inc, i32* %retval, align 4 + br label %if.end + +if.end: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %exitcond = icmp ne i64 %indvars.iv.next, %limit + br i1 %exitcond, label %loop.body, label %loop.exit + +loop.exit: + br label %capture + +loop2.ph: + %iter.check2 = icmp ugt i64 %limit, 0 + br i1 %iter.check2, label %loop2.body, label %loop2.exit + +loop2.body: + %indvars.iv2 = phi i64 [ 0, %loop2.ph ], [ %indvars.iv.next2, %if.end2 ] + %arrayidx2 = getelementptr inbounds i32, i32* %data, i64 %indvars.iv2 + %pred2 = load i32, i32* %arrayidx2, align 4 + %tobool.not2 = icmp eq i32 %pred2, 0 + br i1 %tobool.not2, label %if.end2, label %if.then2 + +if.then2: + %rdx2 = load i32, i32* %retval, align 4 + %rdx.inc2 = add nsw i32 %rdx2, 1 + store i32 %rdx.inc2, i32* %retval, align 4 + br label %if.end2 + +if.end2: + %indvars.iv.next2 = add nsw i64 %indvars.iv2, 1 + %exitcond2 = icmp ne i64 %indvars.iv.next2, %limit + br i1 %exitcond2, label %loop2.body, label %loop2.exit + +loop2.exit: + br label %cond2.check + +cond2.check: + %cmp2 = icmp eq i32 %cond2, 403 + br i1 %cmp2, label %post.capture, label %capture + +capture: + store i32* %retval, i32** %red_list, align 8 + %0 = call i32 @capturin_ur_allocas(i32** nonnull %red_list) + br label %post.capture + +post.capture: + %1 = load i32, i32* %retval, align 4 + ret i32 %1 +} + +declare dso_local i32 @capturin_ur_allocas(i32** nonnull)
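Usage note (not part of the patch): the pass is exposed under three names, all of which appear in the diff above -- the legacy pass manager registration used by the test's RUN lines, the new pass manager registration in PassRegistry.def, and the hidden option that gates it in the default optimization pipeline built by PassBuilder. A minimal way to exercise it with opt (the input filename below is a placeholder):

  opt -partial-mem2reg -S input.ll                               ; legacy PM, as in the RUN lines
  opt -passes=partial-mem2reg -S input.ll                        ; new PM registration
  opt -passes='default<O3>' -enable-partial-mem2reg -S input.ll  ; gate the pass into the default pipeline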