Index: include/llvm/Analysis/ParallelRegionInfo.h
===================================================================
--- /dev/null
+++ include/llvm/Analysis/ParallelRegionInfo.h
@@ -0,0 +1,392 @@
+//===- ParallelRegionInfo.h - Parallel region analysis -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_PARALLELREGIONINFO_H
+#define LLVM_ANALYSIS_PARALLELREGIONINFO_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class Loop;
+class ParallelTask;
+class ParallelRegion;
+
+/// The parallel region info (PRI) identifies and creates parallel regions.
+///
+/// Currently the parallel region info is "lazy" in the sense that it does only
+/// need to be updated if new parallel regions are created (or deleted). As this
+/// should not happen very often (and only in very few places) it allows
+/// transformation passes to preserve the parallel region info without
+/// modifications. Additionally, it makes the analysis very lightweight in the
+/// absence of parallel regions (which should be the majority of functions).
+///
+/// The drawback for passes that need to deal with parallel regions explicitly
+/// is the absence of a mapping from basic blocks to parallel regions. For now
+/// these passes can use the createMapping() function to generate such a mapping
+/// on-demand. After integration of parallel regions a separate function pass
+/// could be introduced to maintain this mapping and recompute it if it was not
+/// preserved. However, at the moment there are only a small
+/// number of places that require it but a lot of transformations that would
+/// need to be modified to preserve it.
+///
+/// @TODO The update interface to add/remove parallel regions is missing.
+///
+class ParallelRegionInfo {
+
+  /// The parallel regions that are not nested in other parallel regions.
+  SmallVector<ParallelRegion *, 4> TopLevelParallelRegions;
+
+public:
+  ParallelRegionInfo() {}
+  ParallelRegionInfo(Function &F, const DominatorTree &DT) {
+    recalculate(F, DT);
+  }
+  ~ParallelRegionInfo() { releaseMemory(); }
+
+  /// Type for the mapping of basic blocks to parallel tasks.
+  using ParallelTaskMappingTy = DenseMap<BasicBlock *, const ParallelTask *>;
+
+  /// Identify the parallel regions in @p F from scratch.
+  ///
+  /// Note that only the extent of the parallel regions and their nesting is
+  /// stored but not the mapping from basic blocks to the surrounding parallel
+  /// task that is returned. Only the former information can be preserved easily
+  /// by passes but the latter is generally only valid at creation time. For an
+  /// on-demand computation of this mapping use the createMapping() function.
+  ///
+  /// @returns A mapping from basic blocks to the surrounding parallel task.
+  ParallelTaskMappingTy recalculate(Function &F, const DominatorTree &DT);
+
+  /// Return the top-level parallel regions in this function.
+  ///
+  ///{
+  SmallVectorImpl<ParallelRegion *> &getTopLevelParallelRegions() {
+    return TopLevelParallelRegions;
+  }
+  const SmallVectorImpl<ParallelRegion *> &getTopLevelParallelRegions() const {
+    return TopLevelParallelRegions;
+  }
+  ///}
+
+  /// Check for containment in any parallel region.
+  ///{
+  bool containedInAny(const BasicBlock *BB, const DominatorTree &DT) const;
+  bool containedInAny(const Instruction *I, const DominatorTree &DT) const;
+  bool containedInAny(const Loop *L, const DominatorTree &DT) const;
+  ///}
+
+  /// Check for possible containment in any parallel region.
+ ///{ + bool maybeContainedInAny(const BasicBlock *BB, const DominatorTree &DT) const; + bool maybeContainedInAny(const Instruction *I, const DominatorTree &DT) const; + bool maybeContainedInAny(const Loop *L, const DominatorTree &DT) const; + ///} + + /// Return true if promoting @p AI will not interfere with parallel regions. + bool isSafeToPromote(const AllocaInst &AI, const DominatorTree &DT) const; + + /// Compute a _new_ mapping from basic block to parallel task (and region). + /// + /// This function is linear in the number of blocks contained in parallel + /// regions. + /// + /// @returns A mapping from basic blocks to the surrounding parallel task. + ParallelTaskMappingTy createMapping() const; + + /// Delete all memory allocated for parallel regions. + void releaseMemory(); + + /// Pretty print the parallel regions of the function. + ///{ + void print(raw_ostream &OS) const; + void dump() const; + ///} +}; + +/// A parallel task is a single entry, multiple exit CFG region that represents +/// code that can be executed in parallel to its sibling task. +class ParallelTask { + + /// The surrounding parallel region. + ParallelRegion &ParentRegion; + + /// The single entry block of this task. + BasicBlock &EntryBB; + + /// The set of halts (for forked tasks) or joins (for continuation tasks) that + /// terminate this task. + SetVector HaltsOrJoints; + + /// Creator functions for parallel tasks. + ///{ + ParallelTask(ParallelRegion &ParentRegion, BasicBlock &EntryBB); + + /// Add a halt or join to this parallel task. + void addHaltOrJoin(TerminatorInst &TI); + + ///} + +public: + /// Return the parallel region surrounding this task. + ParallelRegion &getParentRegion() const { return ParentRegion; } + + /// Return the start block of this parallel task, thus the successor of the + /// fork that started the parallel region. + BasicBlock &getEntry() const { return EntryBB; } + + /// Return true if this is a forked task. 
+ bool isForkedTask() const; + + /// Return true if this is a continuation task. + bool isContinuationTask() const; + + /// Return the sibling task of this one. + ParallelTask &getSiblingTask() const; + + /// Return all halts (for forked tasks) or joins (for continuation tasks). + const SetVector &getHaltsOrJoints() const { + return HaltsOrJoints; + } + + /// The contain interface is designed deliberately different from similar + /// functions like Loop::contains(*) as it takes a dominator tree as a second + /// argument. This allows both ParallelTask and ParallelRegion to remain valid + /// even if transformations change the CFG structure inside them. As a + /// consequence there are less modifications needed in the existing code base. + /// + /// The cost for all three calls is the same as they can all be reduced to the + /// lookup of a single basic block. + ///{ + bool contains(const BasicBlock *BB, const DominatorTree &DT) const; + bool contains(const Instruction *I, const DominatorTree &DT) const; + bool contains(const Loop *L, const DominatorTree &DT) const; + ///} + + /// Potentially approximated but fast contains check. + /// + /// If mayContain returns false contains will also return false. If + /// mayContains returns true, contains can either return true or false. + /// + /// In case hasSingleExit(*) returns true, these functions are equal to + /// a call to contains. + /// + /// Note that this function is already used by the contains interface. Use it + /// only if approximate information is sufficient, otherwise just call + /// contains. + ///{ + bool mayContain(const BasicBlock *BB, const DominatorTree &DT) const; + bool mayContain(const Instruction *I, const DominatorTree &DT) const; + bool mayContain(const Loop *L, const DominatorTree &DT) const; + + /// Return true if this region has a single exit (halt or join). 
+ bool hasSingleExit() const { return HaltsOrJoints.size() == 1;} + ///} + + /// A generic visitor interface as an alternative to an iterator. See the + /// comments if ParallelTask::contains(*) and the ParallelRegionInfo class + /// description for the rational. + /// + ///{ + + /// Type of the visitor function. + /// + /// Note that the return value indicates if the traversal should continue. + using VisitorTy = std::function; + + /// Invokes the @p Visitor on each block of the task or until @p Visitor + /// returns false. + /// + /// @param Visitor The user provided visitor function. + /// @param Recursive If not set this task will visit all contained blocks, + /// thus the second argument for the @p Visitor function will + /// always be this task. If set, parallel tasks contained in + /// this one will visit their blocks. + /// + /// @returns True, if all blocks have been visited, false otherwise. + bool visit(VisitorTy &Visitor, bool Recursive = false) const; + ///} + + /// Pretty print this parallel task. + ///{ + void print(raw_ostream &OS, unsigned indent = 0) const; + void dump() const; + ///} + + friend class ParallelRegion; + friend class ParallelRegionInfo; +}; + +/// Pretty print the parallel task @p PT to @p OS. +inline raw_ostream &operator<<(raw_ostream &OS, const ParallelTask &PT) { + PT.print(OS); + return OS; +} + +/// A parallel region is a single entry, multiple exit CFG region that +/// represents code that can be executed in parallel. It is divided in two +/// parallel tasks, the forked task and the continuation task. +class ParallelRegion { +public: + using SubRegionMapTy = DenseMap; + +private: + /// The parallel regions that are surrounded (and owned) by this one. + SubRegionMapTy ParallelSubRegions; + + /// The parallel region info analysis. + ParallelRegionInfo &PRI; + + /// The fork that starts this parallel region. + ForkInst &Fork; + + /// The parent parallel task or nullptr if this is a top-level region. 
+ ParallelTask *ParentTask; + + /// The forked parallel task. + ParallelTask ForkedTask; + + /// The continuation parallel task. + ParallelTask ContinuationTask; + + /// Creator functions for parallel regions. + /// + ///{ + ParallelRegion(ParallelRegionInfo &PRI, ForkInst &Fork, + ParallelTask *ParentTask); + + /// Add a sub-region to this one and thereby transfer ownership. + void addParallelSubRegion(ParallelRegion &ParallelSubRegion); + ///} + +public: + ~ParallelRegion(); + + /// Return the fork that starts this parallel region. + ForkInst &getFork() const { return Fork; } + + /// Return the parallel task that contains this parallel region. + ParallelTask *getParentTask() const { return ParentTask; } + + /// Return the set of parallel sub-regions. + ///{ + SubRegionMapTy &getSubRegions() { return ParallelSubRegions; } + const SubRegionMapTy &getSubRegions() const { + return ParallelSubRegions; + } + ///} + + /// Return the forked task. + ///{ + ParallelTask &getForkedTask() { return ForkedTask; } + const ParallelTask &getForkedTask() const { return ForkedTask; } + ///} + + /// Return the continuation task. + ///{ + ParallelTask &getContinuationTask() { return ContinuationTask; } + const ParallelTask &getContinuationTask() const { return ContinuationTask; } + ///} + + /// A generic visitor interface as an alternative to an iterator. 
+ /// See ParallelTask::visit for more information + ///{ + /// @see ParallelTask::visit(VisitorTy &Visitor, bool Recursive) + bool visit(ParallelTask::VisitorTy &Visitor, bool Recursive = false) const { + return ForkedTask.visit(Visitor, Recursive) && + ContinuationTask.visit(Visitor, Recursive); + } + ///} + + /// @see ParallelTask::contains(const BasicBlock *BB, DominatorTree &DT) + bool contains(const BasicBlock *BB, const DominatorTree &DT) const; + + /// @see ParallelTask::contains(const Instruction *I, DominatorTree &DT) + bool contains(const Instruction *I, const DominatorTree &DT) const; + + /// @see ParallelTask::contains(const Loop *L, DominatorTree &DT) + bool contains(const Loop *L, const DominatorTree &DT) const; + + /// Return true if both tasks have a single exit (halt or join). + bool hasTwoSingleExits() const { + return ForkedTask.hasSingleExit() && ContinuationTask.hasSingleExit(); + } + + /// @see ParallelTask::mayContain(const BasicBlock *BB, DominatorTree &DT) + bool mayContain(const BasicBlock *BB, const DominatorTree &DT) const; + + /// @see ParallelTask::mayContain(const Instruction *I, DominatorTree &DT) + bool mayContain(const Instruction *I, const DominatorTree &DT) const; + + /// @see ParallelTask::mayContain(const Loop *L, DominatorTree &DT) + bool mayContain(const Loop *L, const DominatorTree &DT) const; + + /// Pretty print this parallel region and all sub-regions. + ///{ + void print(raw_ostream &OS, unsigned indent = 0) const; + void dump() const; + ///} + + friend class ParallelRegionInfo; +}; + +/// Pretty print the parallel region @p PR to @p OS. +inline raw_ostream &operator<<(raw_ostream &OS, const ParallelRegion &PR) { + PR.print(OS); + return OS; +} + +/// New pass manager wrapper pass around the parallel region info. 
+class ParallelRegionAnalysis : public AnalysisInfoMixin<ParallelRegionAnalysis> {
+  friend AnalysisInfoMixin<ParallelRegionAnalysis>;
+  static AnalysisKey Key;
+
+public:
+  typedef ParallelRegionInfo Result;
+
+  /// Run the analysis pass over a function and identify the parallel regions.
+  ParallelRegionInfo run(Function &F, FunctionAnalysisManager &AM);
+};
+
+
+/// Function pass wrapper around the parallel region info.
+class ParallelRegionInfoPass : public FunctionPass {
+  ParallelRegionInfo PRI;
+
+public:
+  static char ID;
+  ParallelRegionInfoPass() : FunctionPass(ID) {}
+
+  /// Return the parallel region info analysis.
+  ///{
+  ParallelRegionInfo &getParallelRegionInfo() { return PRI; }
+  const ParallelRegionInfo &getParallelRegionInfo() const { return PRI; }
+  ///}
+
+  /// Initialize the parallel region info for this function.
+  bool runOnFunction(Function &F) override;
+
+  /// Verify the analysis as well as some of the functions provided.
+  void verifyAnalysis() const override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  /// Pretty print the parallel regions of the function.
+  ///{
+  void print(raw_ostream &OS, const Module *) const override;
+  void dump() const;
+  ///}
+};
+
+} // End llvm namespace
+#endif
Index: include/llvm/Analysis/Passes.h
===================================================================
--- include/llvm/Analysis/Passes.h
+++ include/llvm/Analysis/Passes.h
@@ -79,6 +79,13 @@
   //
   FunctionPass *createRegionInfoPass();
 
+  //===--------------------------------------------------------------------===//
+  //
+  // createParallelRegionInfoPass - This pass finds all parallel regions
+  // in a function and builds the parallel region hierarchy.
+  //
+  FunctionPass *createParallelRegionInfoPass();
+
   // Print module-level debug info metadata in human-readable form.
ModulePass *createModuleDebugInfoPrinterPass(); Index: include/llvm/IR/InstrTypes.h =================================================================== --- include/llvm/IR/InstrTypes.h +++ include/llvm/IR/InstrTypes.h @@ -93,6 +93,18 @@ return isa(V) && classof(cast(V)); } + // \brief Returns true if this terminator relates to parallelism. + bool isForkJoinHalt() const { + switch (getOpcode()) { + case Instruction::Fork: + case Instruction::Halt: + case Instruction::Join: + return true; + default: + return false; + } + } + // \brief Returns true if this terminator relates to exception handling. bool isExceptional() const { switch (getOpcode()) { Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -263,6 +263,7 @@ void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry&); void initializeOptimizePHIsPass(PassRegistry&); void initializePAEvalPass(PassRegistry &); +void initializeParallelRegionInfoPassPass(PassRegistry&); void initializePEIPass(PassRegistry&); void initializePGOIndirectCallPromotionLegacyPassPass(PassRegistry&); void initializePGOInstrumentationGenLegacyPassPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -149,6 +149,7 @@ (void) llvm::createRegionOnlyViewerPass(); (void) llvm::createRegionPrinterPass(); (void) llvm::createRegionViewerPass(); + (void) llvm::createParallelRegionInfoPass(); (void) llvm::createSCCPPass(); (void) llvm::createSafeStackPass(); (void) llvm::createSROAPass(); Index: lib/Analysis/Analysis.cpp =================================================================== --- lib/Analysis/Analysis.cpp +++ lib/Analysis/Analysis.cpp @@ -44,6 +44,7 @@ initializeDomViewerPass(Registry); initializeDomPrinterPass(Registry); initializeDomOnlyViewerPass(Registry); + 
initializeParallelRegionInfoPassPass(Registry); initializePostDomViewerPass(Registry); initializeDomOnlyPrinterPass(Registry); initializePostDomPrinterPass(Registry); Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -60,6 +60,7 @@ ObjCARCInstKind.cpp OptimizationDiagnosticInfo.cpp OrderedBasicBlock.cpp + ParallelRegionInfo.cpp PHITransAddr.cpp PostDominators.cpp ProfileSummaryInfo.cpp Index: lib/Analysis/ParallelRegionInfo.cpp =================================================================== --- /dev/null +++ lib/Analysis/ParallelRegionInfo.cpp @@ -0,0 +1,591 @@ +//===- ParallelRegionInfo.cpp - Parallel region detection analysis --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the ParallelRegionInfo analysis as well as the ParallelTask +// and ParallelRegion class. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ParallelRegionInfo.h" + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "parallel-region-info" + +STATISTIC(NumParallelRegions, "The # of parallel regions"); +STATISTIC(NumJoinInstructions, "The # of join instructions"); +STATISTIC(NumHaltInstructions, "The # of halt instructions"); + +//===----------------------------------------------------------------------===// +// ParallelTask implementation +// + +ParallelTask::ParallelTask(ParallelRegion &ParentRegion, BasicBlock &EntryBB) + : ParentRegion(ParentRegion), EntryBB(EntryBB) {} + +void ParallelTask::addHaltOrJoin(TerminatorInst &TI) { + assert((!isForkedTask() || isa(TI)) && + "Expected a halt instruction for a forked task!"); + assert((!isContinuationTask() || isa(TI)) && + "Expected a join instruction for a continuation task!"); + HaltsOrJoints.insert(&TI); +} + +bool ParallelTask::isForkedTask() const { + return &ParentRegion.getForkedTask() == this; +} + +bool ParallelTask::isContinuationTask() const { return !isForkedTask(); } + +ParallelTask &ParallelTask::getSiblingTask() const { + return isForkedTask() ? ParentRegion.getContinuationTask() + : ParentRegion.getForkedTask(); +} + +bool ParallelTask::contains(const BasicBlock *BB, + const DominatorTree &DT) const { + if (!mayContain(BB, DT)) + return false; + if (hasSingleExit()) + return true; + + // Fallback to a search of all blocks in this task. 
+  VisitorTy IsNotBB = [BB](BasicBlock &CurrentBB, const ParallelTask &) {
+    return BB != &CurrentBB;
+  };
+  return !visit(IsNotBB);
+}
+
+bool ParallelTask::contains(const Instruction *I,
+                            const DominatorTree &DT) const {
+  return contains(I->getParent(), DT);
+}
+
+bool ParallelTask::contains(const Loop *L, const DominatorTree &DT) const {
+  // A loop can only be contained in a parallel region or task if the header is.
+  // At the same time it has to be contained if the header is as otherwise the
+  // parallel region entry (the fork block) would be in the loop and it would
+  // dominate the supposed header.
+  return contains(L->getHeader(), DT);
+}
+
+bool ParallelTask::mayContain(const BasicBlock *BB,
+                              const DominatorTree &DT) const {
+  // All contained blocks are dominated by the entry block.
+  if (!DT.dominates(&EntryBB, BB))
+    return false;
+
+  // If a join or halt block dominates the block it cannot be contained.
+  for (TerminatorInst *TI : getHaltsOrJoints())
+    if (DT.properlyDominates(TI->getParent(), BB))
+      return false;
+
+  return true;
+}
+
+bool ParallelTask::mayContain(const Instruction *I,
+                              const DominatorTree &DT) const {
+  // NOTE(review): was `contains(...)` — a copy-paste slip that made the fast
+  // approximate check delegate to the potentially linear exact check. The
+  // block-level overload is the intended fast path (see the sibling
+  // ParallelRegion::mayContain(Instruction*), which delegates to mayContain).
+  return mayContain(I->getParent(), DT);
+}
+
+bool ParallelTask::mayContain(const Loop *L, const DominatorTree &DT) const {
+  // See contains(const Loop *, DominatorTree &) for reasoning.
+ return contains(L->getHeader(), DT); +} + +bool ParallelTask::visit(ParallelTask::VisitorTy &Visitor, + bool Recursive) const { + auto &SubRegionMap = ParentRegion.getSubRegions(); + DenseSet SeenBlocks; + SmallVector Worklist; + Worklist.push_back(&EntryBB); + + while (!Worklist.empty()) { + BasicBlock *CurrentBB = Worklist.pop_back_val(); + if (!SeenBlocks.insert(CurrentBB).second) + continue; + + if (!Visitor(*CurrentBB, *this)) + return false; + + if (Recursive) { + if (ForkInst *FI = dyn_cast(CurrentBB->getTerminator())) { + ParallelRegion *PR = SubRegionMap.lookup(FI); + assert(PR && "Found fork instruction but no parallel sub-region!"); + + if (!PR->visit(Visitor, Recursive)) + return false; + + ParallelTask &ContinuationTask = PR->getContinuationTask(); + for (TerminatorInst *TI : ContinuationTask.getHaltsOrJoints()) { + assert(isa(TI) && + "Expected join instructions to terminate continuation task!"); + BasicBlock *JoinBB = TI->getParent(); + Worklist.append(succ_begin(JoinBB), succ_end(JoinBB)); + } + + // Do not traverse the parallel sub-region again. 
+ continue; + } + } + + if (HaltsOrJoints.count(CurrentBB->getTerminator())) + continue; + + Worklist.append(succ_begin(CurrentBB), succ_end(CurrentBB)); + } + + return true; +} + +void ParallelTask::print(raw_ostream &OS, unsigned indent) const { + if (isForkedTask()) + OS.indent(indent) << "Forked Task:\n"; + else + OS.indent(indent) << "Continuation Task:\n"; + OS.indent(indent) << "- Begin: " << EntryBB.getName() << "\n"; + for (TerminatorInst *TI : HaltsOrJoints) + OS.indent(indent) << "- End:" << *TI << "\n"; +} + +void ParallelTask::dump() const { print(dbgs()); } + +//===----------------------------------------------------------------------===// +// ParallelRegion implementation +// + +ParallelRegion::ParallelRegion(ParallelRegionInfo &PRI, ForkInst &Fork, + ParallelTask *ParentTask) + : PRI(PRI), Fork(Fork), ParentTask(ParentTask), + ForkedTask(*this, *Fork.getForkedBB()), + ContinuationTask(*this, *Fork.getContinuationBB()) { + if (ParentTask) + ParentTask->getParentRegion().addParallelSubRegion(*this); +} + +ParallelRegion::~ParallelRegion() { + DeleteContainerSeconds(ParallelSubRegions); +} + +void ParallelRegion::addParallelSubRegion(ParallelRegion &ParallelSubRegion) { + ParallelSubRegions[&ParallelSubRegion.getFork()] = &ParallelSubRegion; +} + +bool ParallelRegion::contains(const BasicBlock *BB, + const DominatorTree &DT) const { + // All contained blocks are dominated by the fork block. + if (!DT.properlyDominates(Fork.getParent(), BB)) + return false; + + // If the above condition is not met we let the sub-tasks handle it. + return ForkedTask.contains(BB, DT) || ContinuationTask.contains(BB, DT); +} + +bool ParallelRegion::contains(const Instruction *I, + const DominatorTree &DT) const { + return contains(I->getParent(), DT); +} + +bool ParallelRegion::contains(const Loop *L, const DominatorTree &DT) const { + // See ParallelTask::contains(const Loop *, DominatorTree &) for reasoning. 
+ return contains(L->getHeader(), DT); +} + +bool ParallelRegion::mayContain(const BasicBlock *BB, + const DominatorTree &DT) const { + // All contained blocks are dominated by the fork block. + if (!DT.properlyDominates(Fork.getParent(), BB)) + return false; + + // If the above condition is not met we let the sub-tasks handle it. + return ForkedTask.mayContain(BB, DT) || ContinuationTask.mayContain(BB, DT); +} + +bool ParallelRegion::mayContain(const Instruction *I, + const DominatorTree &DT) const { + return mayContain(I->getParent(), DT); +} + +bool ParallelRegion::mayContain(const Loop *L, const DominatorTree &DT) const { + // See ParallelTask::contains(const Loop *, DominatorTree &) for reasoning. + return mayContain(L->getHeader(), DT); +} + +void ParallelRegion::print(raw_ostream &OS, unsigned indent) const { + OS.indent(indent) << "Parallel region:\n"; + OS.indent(indent) << "-" << Fork << "\n"; + + OS.indent(indent) << "Forked Task:\n"; + OS.indent(indent + 2) << "- Begin: " << ForkedTask.getEntry().getName() + << "\n"; + for (auto It : ParallelSubRegions) { + ParallelRegion *ParallelSubRegion = It.second; + if (ParallelSubRegion->getParentTask() == &ForkedTask) + ParallelSubRegion->print(OS, indent + 4); + } + for (TerminatorInst *TI : ForkedTask.getHaltsOrJoints()) + OS.indent(indent + 2) << "- End:" << *TI << "\n"; + + OS.indent(indent) << "Continuation Task:\n"; + OS.indent(indent + 2) << "- Begin: " << ContinuationTask.getEntry().getName() + << "\n"; + for (auto It : ParallelSubRegions) { + ParallelRegion *ParallelSubRegion = It.second; + if (ParallelSubRegion->getParentTask() == &ContinuationTask) + ParallelSubRegion->print(OS, indent + 4); + } + for (TerminatorInst *TI : ContinuationTask.getHaltsOrJoints()) + OS.indent(indent + 2) << "- End:" << *TI << "\n"; +} + +void ParallelRegion::dump() const { print(dbgs()); } + +//===----------------------------------------------------------------------===// +// ParallelRegionInfo implementation +// + +void 
ParallelRegionInfo::print(raw_ostream &OS) const { + for (auto *PR : TopLevelParallelRegions) + PR->print(OS); +} + +void ParallelRegionInfo::dump() const { + ParallelTaskMappingTy Mapping = createMapping(); + DenseMap> ReverseMap; + for (auto It : Mapping) { + auto &Blocks = ReverseMap[It.second]; + Blocks.push_back(It.first); + } + for (auto It : ReverseMap) { + errs() << It.getFirst()->getParentRegion().getFork() << "\n"; + errs() << *It.getFirst() << "\n"; + for (auto *BB : It.second) + errs() << "\t - " << BB->getName() << "\n"; + } +} + +void ParallelRegionInfo::releaseMemory() { + DeleteContainerPointers(TopLevelParallelRegions); + TopLevelParallelRegions.clear(); +} + +ParallelRegionInfo::ParallelTaskMappingTy +ParallelRegionInfo::recalculate(Function &F, const DominatorTree &DT) { + releaseMemory(); + + // A mapping from blocks to the parallel tasks they are contained in. + ParallelTaskMappingTy BB2PTMap; + + // Scan the function first to check for fork, halt or join instructions. + // If none are found bail early. This should be removed once most + // transformation that preserve this pass do communicate that to the PM. + bool ContainsParallelTI = false; + for (BasicBlock &BB : F) { + ContainsParallelTI = BB.getTerminator()->isForkJoinHalt(); + if (ContainsParallelTI) + break; + } + if (!ContainsParallelTI) + return BB2PTMap; + + // We use reverse post order (RPO) here only for verification purposes. A + // simple CFG traversal would do just fine if the parallel IR is well-formed + // but it cannot always detect if it is not. A possible alternative to RPO + // would be to use not only dependence but also post-dependence information, + // though that might be more complicated. + ReversePostOrderTraversal RPOT(&F); + + // Traverse all blocks in the function and record + propagate information + // about parallel regions and tasks. 
+ for (BasicBlock *BB : RPOT) { + ParallelTask *PT = const_cast(BB2PTMap.lookup(BB)); + TerminatorInst *TI = BB->getTerminator(); + + ForkInst *FI = dyn_cast(TI); + HaltInst *Halt = dyn_cast(TI); + JoinInst *Join = dyn_cast(TI); + + // If this block is not in a parallel region and not starting one, just + // continue with its successors. + if (!PT && !FI) { + assert(!Halt && "Found halt instruction outside of a parallel region!"); + assert(!Join && "Found join instruction outside of a parallel region!"); + continue; + } + + if (FI) { + // Fork instructions start a parallel region. We create it and map the + // successors to the new one. + ParallelRegion *NewPR = new ParallelRegion(*this, *FI, PT); + + // If it is a top level parallel region we additionally store it in the + // TopLevelParallelRegions container. + if (!PT) + TopLevelParallelRegions.push_back(NewPR); + + // Bookkeeping + NumParallelRegions++; + + // We add the successors of forks explicitly to the BB2PT map to be able + // to exit early here. + BB2PTMap[FI->getForkedBB()] = &NewPR->getForkedTask(); + BB2PTMap[FI->getContinuationBB()] = &NewPR->getContinuationTask(); + continue; + } + + if (Halt) { + // Halt instructions terminate the forked task but do not change the + // current parallel region. + assert(PT && "Found halt instruction outside of a parallel region!"); + assert(PT->isForkedTask() && + "Found halt instruction in continuation task!"); + ParallelRegion &PR = PT->getParentRegion(); + assert(DT.dominates(PR.getFork().getParent(), BB) && + "Parallel region fork does not dominate halt instruction!"); + assert(DT.dominates(&PT->getEntry(), BB) && + "Forked task entry does not dominate halt instruction!"); + assert(PR.getFork().getContinuationBB() == Halt->getContinuationBB() && + "Halt successor was not the continuation block!"); + PT->addHaltOrJoin(*Halt); + + // Bookkeeping + NumHaltInstructions++; + + // The forked tasks ends with the halt instruction. 
+      continue;
+    }
+
+    if (Join) {
+      // Join instructions terminate the continuation task and the current
+      // parallel region.
+      assert(PT && "Found join instruction outside of a parallel region!");
+      assert(PT->isContinuationTask() &&
+             "Found join instruction in forked task!");
+      ParallelRegion &PR = PT->getParentRegion();
+      assert(DT.dominates(PR.getFork().getParent(), BB) &&
+             "Parallel region fork does not dominate join instruction!");
+      assert(DT.dominates(&PT->getEntry(), BB) &&
+             "Continuation task entry does not dominate join instruction!");
+      PT->addHaltOrJoin(*Join);
+      PT = PR.getParentTask();
+
+      // Bookkeeping
+      NumJoinInstructions++;
+    }
+
+    // If we left all parallel regions we do not need the BB2PTMap.
+    if (!PT)
+      continue;
+
+    // For now we do not allow exception handling in parallel regions.
+    assert(!TI->isExceptional() &&
+           "Exception handling in parallel regions is not supported yet!");
+
+    // Verify we do not leave a parallel region "open", thus reach a function
+    // exit terminator before a join.
+    assert(TI->getNumSuccessors() > 0 && "A parallel region was not terminated "
+                                         "before a function exit was reached");
+
+    // Propagate the parallel task information to the successors.
+    for (BasicBlock *SuccBB : successors(BB)) {
+      const ParallelTask *&OldTask = BB2PTMap[SuccBB];
+      assert((!OldTask || OldTask == PT) &&
+             "Basic block cannot belong to two different parallel tasks!");
+      OldTask = PT;
+    }
+  }
+
+  return BB2PTMap;
+}
+
+ParallelRegionInfo::ParallelTaskMappingTy
+ParallelRegionInfo::createMapping() const {
+  // While we could use the recompute algorithm here it is easier to just walk
+  // the parallel regions we know recursively.
+  ParallelTaskMappingTy BB2PTMap;
+
+  ParallelTask::VisitorTy Visitor = [&BB2PTMap](BasicBlock &BB,
+                                                const ParallelTask &PT) {
+    BB2PTMap[&BB] = &PT;
+    return true;
+  };
+
+  for (ParallelRegion *PR : TopLevelParallelRegions) {
+    bool Success = PR->visit(Visitor, true);
+    (void)Success;
+    assert(Success && "Expected walk over parallel region to visit all blocks!");
+  }
+
+  return BB2PTMap;
+}
+
+bool ParallelRegionInfo::containedInAny(const BasicBlock *BB,
+                                        const DominatorTree &DT) const {
+  return std::any_of(
+      TopLevelParallelRegions.begin(), TopLevelParallelRegions.end(),
+      [BB, &DT](ParallelRegion *PR) { return PR->contains(BB, DT); });
+}
+
+bool ParallelRegionInfo::containedInAny(const Instruction *I,
+                                        const DominatorTree &DT) const {
+  return containedInAny(I->getParent(), DT);
+}
+
+bool ParallelRegionInfo::containedInAny(const Loop *L,
+                                        const DominatorTree &DT) const {
+  // See ParallelTask::contains(const Loop *, DominatorTree &) for reasoning.
+  return containedInAny(L->getHeader(), DT);
+}
+
+bool ParallelRegionInfo::maybeContainedInAny(const BasicBlock *BB,
+                                             const DominatorTree &DT) const {
+  return std::any_of(
+      TopLevelParallelRegions.begin(), TopLevelParallelRegions.end(),
+      [BB, &DT](ParallelRegion *PR) { return PR->mayContain(BB, DT); });
+}
+
+bool ParallelRegionInfo::maybeContainedInAny(const Instruction *I,
+                                             const DominatorTree &DT) const {
+  return maybeContainedInAny(I->getParent(), DT);
+}
+
+bool ParallelRegionInfo::maybeContainedInAny(const Loop *L,
+                                             const DominatorTree &DT) const {
+  // See ParallelTask::contains(const Loop *, DominatorTree &) for reasoning.
+  return maybeContainedInAny(L->getHeader(), DT);
+}
+
+bool ParallelRegionInfo::isSafeToPromote(const AllocaInst &AI,
+                                         const DominatorTree &DT) const {
+
+  // First check if we know that AI is contained in a parallel region.
+  ParallelRegion *AIPR = nullptr;
+  if (AI.getParent() != &AI.getFunction()->getEntryBlock()) {
+    for (ParallelRegion *PR : TopLevelParallelRegions) {
+      // In order to not perform a potentially linear contains check we will
+      // skip parallel regions that have multi-exit tasks. This is conservative
+      // but sound.
+      if (!PR->hasTwoSingleExits())
+        continue;
+      if (!PR->contains(&AI, DT))
+        continue;
+
+      AIPR = PR;
+      break;
+    }
+  }
+
+  // If we know a parallel region that contains AI we determine the smallest one
+  // that does. If the smallest one does not have parallel sub-regions we can
+  // promote the alloca as it will only be alive inside one parallel thread of
+  // that region. If the smallest one does contain parallel sub-regions we bail
+  // for now as the alloca could be used in a parallel context.
+  while (AIPR) {
+    ParallelRegion::SubRegionMapTy SubRegions = AIPR->getSubRegions();
+    if (SubRegions.empty())
+      return true;
+
+    ParallelRegion *SubAIPR = nullptr;
+    for (auto It : SubRegions) {
+      ParallelRegion *SR = It.second;
+      // In order to not perform a potentially linear contains check we will
+      // bail for parallel sub-regions that have multi-exit tasks.
+      if (!SR->hasTwoSingleExits())
+        return false;
+
+      if (!SR->contains(&AI, DT))
+        continue;
+
+      SubAIPR = SR;
+      break;
+    }
+
+    if (!SubAIPR)
+      return false;
+    AIPR = SubAIPR;
+  }
+
+  assert(!AIPR);
+
+  // If we do not know that the alloca is inside a parallel task we will not
+  // allow promotion if any user might be in a parallel region.
+  // TODO Check only for spawned tasks not parallel regions.
+  for (const User *U : AI.users()) {
+    const Instruction *I = dyn_cast<Instruction>(U);
+    if (!I || !I->mayWriteToMemory())
+      continue;
+
+    if (maybeContainedInAny(I, DT))
+      return false;
+  }
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// ParallelRegionAnalysis implementation
+//
+
+AnalysisKey ParallelRegionAnalysis::Key;
+
+ParallelRegionInfo ParallelRegionAnalysis::run(Function &F,
+                                               FunctionAnalysisManager &AM) {
+  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  return ParallelRegionInfo(F, DT);
+}
+
+//===----------------------------------------------------------------------===//
+// ParallelRegionInfoPass implementation
+//
+
+bool ParallelRegionInfoPass::runOnFunction(Function &F) {
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  PRI.recalculate(F, DT);
+  return false;
+}
+
+void ParallelRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<DominatorTreeWrapperPass>();
+}
+
+void ParallelRegionInfoPass::print(raw_ostream &OS, const Module *) const {
+  PRI.print(OS);
+}
+
+void ParallelRegionInfoPass::verifyAnalysis() const {
+  // TODO Not implemented but merely a stub.
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ParallelRegionInfoPass::dump() const { PRI.dump(); }
+#endif
+
+char ParallelRegionInfoPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ParallelRegionInfoPass, "parallel-regions",
+                      "Detect parallel regions", true, true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ParallelRegionInfoPass, "parallel-regions",
+                    "Detect parallel regions", true, true)
+
+// Create methods available outside of this file, to be used in
+// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
+// the link time optimization.
+ +namespace llvm { +FunctionPass *createParallelRegionInfoPass() { + return new ParallelRegionInfoPass(); +} +} Index: test/PIR/basic_broken_1.ll =================================================================== --- /dev/null +++ test/PIR/basic_broken_1.ll @@ -0,0 +1,28 @@ +; RUN: not not opt -parallel-regions -analyze < %s 2> %t +; RUN: FileCheck --input-file %t %s +; REQUIRES: asserts +; CHECK: Parallel region fork does not dominate halt instruction + +declare void @foo(); + +define void @region_entered_without_fork1(i1 %cond) { +entry: + br i1 %cond, label %fork_entry, label %non_fork_entry + +non_fork_entry: + br label %forked + +fork_entry: + fork label %forked, %cont + +forked: + call void @foo() + halt label %cont + +cont: + call void @foo() + join label %join + +join: + ret void +} Index: test/PIR/basic_broken_2.ll =================================================================== --- /dev/null +++ test/PIR/basic_broken_2.ll @@ -0,0 +1,28 @@ +; RUN: not not opt -parallel-regions -analyze < %s 2> %t +; RUN: FileCheck --input-file %t %s +; REQUIRES: asserts +; CHECK: Parallel region fork does not dominate join instruction + +declare void @foo(); + +define void @region_entered_without_fork2(i1 %cond) { +entry: + br i1 %cond, label %fork_entry, label %non_fork_entry + +non_fork_entry: + br label %cont + +fork_entry: + fork label %forked, %cont + +forked: + call void @foo() + halt label %cont + +cont: + call void @foo() + join label %join + +join: + ret void +} Index: test/PIR/basic_broken_3.ll =================================================================== --- /dev/null +++ test/PIR/basic_broken_3.ll @@ -0,0 +1,28 @@ +; RUN: not not opt -parallel-regions -analyze < %s 2> %t +; RUN: FileCheck --input-file %t %s +; REQUIRES: asserts +; CHECK: A parallel region was not terminated + +declare void @foo(); + +define void @forked_task_may_not_reach_halt_1(i1 %cond) { +entry: + fork label %forked, %cont + +forked: + call void @foo() + br i1 %cond, label 
%halt_block, label %non_halt_block + +non_halt_block: + br label %join + +halt_block: + halt label %cont + +cont: + call void @foo() + join label %join + +join: + ret void +} Index: test/PIR/basic_broken_4.ll =================================================================== --- /dev/null +++ test/PIR/basic_broken_4.ll @@ -0,0 +1,28 @@ +; RUN: not not opt -parallel-regions -analyze < %s 2> %t +; RUN: FileCheck --input-file %t %s +; REQUIRES: asserts +; CHECK: Basic block cannot belong to two different parallel tasks + +declare void @foo(); + +define void @forked_task_may_not_reach_halt_1(i1 %cond) { +entry: + fork label %forked, %cont + +forked: + call void @foo() + br i1 %cond, label %halt_block, label %non_halt_block + +non_halt_block: + br label %cont + +halt_block: + halt label %cont + +cont: + call void @foo() + join label %join + +join: + ret void +} Index: test/PIR/basic_broken_5.ll =================================================================== --- /dev/null +++ test/PIR/basic_broken_5.ll @@ -0,0 +1,28 @@ +; RUN: not not opt -parallel-regions -analyze < %s 2> %t +; RUN: FileCheck --input-file %t %s +; REQUIRES: asserts +; CHECK: A parallel region was not terminated + +declare void @foo(); + +define void @forked_task_may_not_reach_halt_1(i1 %cond) { +entry: + fork label %forked, %cont + +forked: + call void @foo() + br i1 %cond, label %halt_block, label %non_halt_block + +non_halt_block: + ret void + +halt_block: + halt label %cont + +cont: + call void @foo() + join label %join + +join: + ret void +} Index: test/PIR/parallel_region_info.ll =================================================================== --- /dev/null +++ test/PIR/parallel_region_info.ll @@ -0,0 +1,78 @@ +; RUN: opt -parallel-regions -analyze < %s | FileCheck %s + +; CHECK: Printing analysis 'Detect parallel regions' for function 'simple': +; CHECK: Parallel region: +; CHECK: - fork label %forked, %cont +; CHECK: Forked Task: +; CHECK: - Begin: forked +; CHECK: - End: halt label %cont 
+; CHECK: Continuation Task: +; CHECK: - Begin: cont +; CHECK: - End: join label %join + +declare void @foo(); + +define void @simple() { +entry: + fork label %forked, %cont + +forked: + call void @foo() + halt label %cont + +cont: + call void @foo() + join label %join + +join: + ret void +} + +; CHECK: Printing analysis 'Detect parallel regions' for function 'nested': +; CHECK: Parallel region: +; CHECK: - fork label %forked.outer, %cont.outer +; CHECK: Forked Task: +; CHECK: - Begin: forked.outer +; CHECK: Parallel region: +; CHECK: - fork label %forked.inner, %cont.inner +; CHECK: Forked Task: +; CHECK: - Begin: forked.inner +; CHECK: - End: halt label %cont.inner +; CHECK: Continuation Task: +; CHECK: - Begin: cont.inner +; CHECK: - End: join label %forked.outer.end +; CHECK: - End: halt label %cont.outer +; CHECK: Continuation Task: +; CHECK: - Begin: cont.outer +; CHECK: - End: join label %join + +define void @nested() { +entry: + fork label %forked.outer, %cont.outer + +forked.outer: + call void @foo() + fork label %forked.inner, %cont.inner + +forked.inner: + call void @foo() + br label %forked.inner.body + +forked.inner.body: + call void @foo() + halt label %cont.inner + +cont.inner: + call void @foo() + join label %forked.outer.end + +forked.outer.end: + halt label %cont.outer + +cont.outer: + call void @foo() + join label %join + +join: + ret void +}