Index: include/llvm/Analysis/ParallelRegionInfo.h =================================================================== --- include/llvm/Analysis/ParallelRegionInfo.h +++ include/llvm/Analysis/ParallelRegionInfo.h @@ -45,9 +45,20 @@ /// @TODO The update interface to add/remove parallel regions is missing. /// class ParallelRegionInfo { +public: + /// The container type used to store top level parallel regions. + using ParallelRegionVectorTy = SmallVector<ParallelRegion *, 4>; + + /// Iterator types for top level parallel regions. + /// + ///{ + using iterator = ParallelRegionVectorTy::iterator; + using const_iterator = ParallelRegionVectorTy::const_iterator; + ///} +private: /// The parallel regions that are not nested in other parallel regions. - SmallVector<ParallelRegion *, 4> TopLevelParallelRegions; + ParallelRegionVectorTy TopLevelParallelRegions; public: ParallelRegionInfo() {} @@ -71,14 +82,60 @@ /// Return the top-level parallel regions in this function. /// ///{ - SmallVectorImpl<ParallelRegion *> &getTopLevelParallelRegions() { + ParallelRegionVectorTy &getTopLevelParallelRegions() { return TopLevelParallelRegions; } - const SmallVectorImpl<ParallelRegion *> &getTopLevelParallelRegions() const { + const ParallelRegionVectorTy &getTopLevelParallelRegions() const { return TopLevelParallelRegions; } ///} + /// Return true if there is no parallel region in this function. + bool empty() const { return TopLevelParallelRegions.empty(); } + + /// Remove all cached parallel regions of this function. + void clear() { TopLevelParallelRegions.clear(); } + + /// Return the number of top level parallel regions in this function. + ParallelRegionVectorTy::size_type size() const { + return TopLevelParallelRegions.size(); + } + + /// Iterators for parallel regions. 
+ /// + ///{ + iterator begin() { return TopLevelParallelRegions.begin(); } + iterator end() { return TopLevelParallelRegions.end(); } + const_iterator begin() const { return TopLevelParallelRegions.begin(); } + const_iterator end() const { return TopLevelParallelRegions.end(); } + ///} + + /// Fill @p Container with all parallel regions (not only top level). + void getAllParallelRegions(ParallelRegionVectorTy &Container) const; + + /// Return the parallel region that makes @p L a parallel loop, if any. + /// + /// A parallel loop is a loop that does fork (parts of) its body to a new + /// task which are joined only after the loop. It is also ensured that + /// everything that is not forked but part of the loop does not cause + /// side-effects. These instructions usually compute the exit condition + /// and the new values for the induction variable(s). + /// + /// Note: We allow multiple induction variables and complex exit conditions + /// here. However, for some parallel runtimes these have to be simplified, + /// e.g. expressed with regards to one canonical induction variable. + /// TODO: Provide an interface to perform the simplification. + /// + /// Note: We allow arbitrary code after the loop but before a join + /// instruction. + ParallelRegion *getParallelLoopRegion(const Loop &L, + const DominatorTree &DT) const; + + /// Return true if @p L is a parallel loop. + /// + /// See ParallelRegionInfo::getParallelLoopRegion for more information. + bool isParallelLoop(const Loop &L, const DominatorTree &DT) const; + /// Check for containment in any parallel region. ///{ bool containedInAny(const BasicBlock *BB, const DominatorTree &DT) const; @@ -308,6 +365,11 @@ } ///} + /// Return true if this is a parallel loop region for @p L. + /// + /// See ParallelRegionInfo::getParallelLoopRegion for more information. 
+ bool isParallelLoopRegion(const Loop &L, const DominatorTree &DT) const; + /// @see ParallelTask::contains(const BasicBlock *BB, DominatorTree &DT) bool contains(const BasicBlock *BB, const DominatorTree &DT) const; Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -58,6 +58,9 @@ /// Initialize all passes linked into the CodeGen library. void initializeTarget(PassRegistry&); +/// Initialize all passes linked into the PIR library. +void initializePIROpts(PassRegistry&); + void initializeAAEvalLegacyPassPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry &); void initializeADCELegacyPassPass(PassRegistry&); @@ -319,6 +322,7 @@ void initializeScalarizerPass(PassRegistry&); void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&); void initializeSeparateConstOffsetFromGEPPass(PassRegistry &); +void initializeSequentializePIRPass(PassRegistry&); void initializeShadowStackGCLoweringPass(PassRegistry&); void initializeShrinkWrapPass(PassRegistry &); void initializeSimpleInlinerPass(PassRegistry&); Index: include/llvm/Transforms/Scalar/SROA.h =================================================================== --- include/llvm/Transforms/Scalar/SROA.h +++ include/llvm/Transforms/Scalar/SROA.h @@ -26,6 +26,8 @@ namespace llvm { +class ParallelRegionInfo; + /// A private "module" namespace for types and utilities used by SROA. These /// are implementation details and should not be used by clients. namespace sroa LLVM_LIBRARY_VISIBILITY { @@ -59,6 +61,7 @@ LLVMContext *C = nullptr; DominatorTree *DT = nullptr; AssumptionCache *AC = nullptr; + ParallelRegionInfo *PRI = nullptr; /// \brief Worklist of alloca instructions to simplify. /// @@ -114,7 +117,7 @@ /// Helper used by both the public run method and by the legacy pass. 
PreservedAnalyses runImpl(Function &F, DominatorTree &RunDT, - AssumptionCache &RunAC); + AssumptionCache &RunAC, ParallelRegionInfo &RunPRI); bool presplitLoadsAndStores(AllocaInst &AI, sroa::AllocaSlices &AS); AllocaInst *rewritePartition(AllocaInst &AI, sroa::AllocaSlices &AS, Index: include/llvm/Transforms/Utils/PromoteMemToReg.h =================================================================== --- include/llvm/Transforms/Utils/PromoteMemToReg.h +++ include/llvm/Transforms/Utils/PromoteMemToReg.h @@ -22,6 +22,7 @@ class DominatorTree; class AliasSetTracker; class AssumptionCache; +class ParallelRegionInfo; /// \brief Return true if this alloca is legal for promotion. /// @@ -29,7 +30,12 @@ /// (transitively) using this alloca. This also enforces that there is only /// ever one layer of bitcasts or GEPs between the alloca and the lifetime /// markers. -bool isAllocaPromotable(const AllocaInst *AI); +/// +/// In case the dominance tree and the parallel region info are _both_ given +/// we also verify that promoting the alloca does not break invariants of the +/// parallel regions. See ParallelRegionInfo::isSafeToPromote(*) for details. +bool isAllocaPromotable(const AllocaInst *AI, const DominatorTree *DT = nullptr, + const ParallelRegionInfo *PRI = nullptr); /// \brief Promote the specified list of alloca instructions into scalar /// registers, inserting PHI nodes as appropriate. Index: lib/Analysis/ParallelRegionInfo.cpp =================================================================== --- lib/Analysis/ParallelRegionInfo.cpp +++ lib/Analysis/ParallelRegionInfo.cpp @@ -182,6 +182,72 @@ ParallelSubRegions[&ParallelSubRegion.getFork()] = &ParallelSubRegion; } +bool ParallelRegion::isParallelLoopRegion(const Loop &L, + const DominatorTree &DT) const { + // Bail if the fork is not part of the loop. + if (!L.contains(getFork().getParent())) + return false; + + // Bail if a join is part of the loop. 
+ const auto &JoinsVector = ContinuationTask.getHaltsOrJoints(); + if (std::any_of(JoinsVector.begin(), JoinsVector.end(), + [&L](TerminatorInst *TI) { return L.contains(TI); })) + return false; + + // Now check for possible side effects outside the forked task. First the + // header to the fork and then the part of the continuation which is inside + // the loop. + SmallVector<BasicBlock *, 8> Worklist; + BasicBlock *HeaderBB = L.getHeader(); + Worklist.push_back(HeaderBB); + + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + assert(L.contains(BB)); + + for (Instruction &I : *BB) + if (I.mayHaveSideEffects()) + return false; + + if (&getFork() == BB->getTerminator()) + continue; + + for (BasicBlock *SuccessorBB : successors(BB)) + if (L.contains(SuccessorBB)) + Worklist.push_back(SuccessorBB); + } + + BasicBlock &ContinuationEntryBB = ContinuationTask.getEntry(); + assert(L.contains(&ContinuationEntryBB) && + "Fork instructions should not be used to exit a loop!"); + + // We do not support nested loops in the continuation part. + if (std::any_of(L.begin(), L.end(), [&](Loop *SubLoop) { + return ContinuationTask.contains(SubLoop, DT); + })) + return false; + + // Traverse all blocks that are in the continuation and in the loop and check + // for side-effects. + assert(Worklist.empty()); + Worklist.push_back(&ContinuationEntryBB); + + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + assert(L.contains(BB)); + + for (Instruction &I : *BB) + if (I.mayHaveSideEffects()) + return false; + + for (BasicBlock *SuccessorBB : successors(BB)) + if (L.contains(SuccessorBB) && SuccessorBB != HeaderBB) + Worklist.push_back(SuccessorBB); + } + + return true; +} + bool ParallelRegion::contains(const BasicBlock *BB, const DominatorTree &DT) const { // All contained blocks are dominated by the fork block. 
@@ -430,6 +496,59 @@ return BB2PTMap; } +void ParallelRegionInfo::getAllParallelRegions( + ParallelRegionVectorTy &Container) const { + + Container.append(begin(), end()); + for (unsigned i = 0; i < Container.size(); i++) { + ParallelRegion *PR = Container[i]; + + for (const auto &It : PR->getSubRegions()) + Container.push_back(It.getSecond()); + } +} + +ParallelRegion *ParallelRegionInfo::getParallelLoopRegion(const Loop &L, + const DominatorTree &DT) const { + if (empty()) + return nullptr; + + SmallVector<ParallelRegion *, 8> ParallelRegions; + ParallelRegions.append(begin(), end()); + + // The parallel region we are looking for. + ParallelRegion *ParallelLoopRegion = nullptr; + + while (!ParallelRegions.empty()) { + ParallelRegion *PR = ParallelRegions.pop_back_val(); + + for (const auto &It : PR->getSubRegions()) + ParallelRegions.push_back(It.getSecond()); + + if (!PR->isParallelLoopRegion(L, DT)) + continue; + + assert(!ParallelLoopRegion && + "Broken nesting resulted in multiple parallel loop regions"); + + ParallelLoopRegion = PR; + +#ifdef NDEBUG + // In release builds we assume the nesting was intact and stop at the first + // parallel loop region. Debug builds keep scanning so the assertion above + // can verify that at most one parallel loop region exists. + break; +#endif + } + + return ParallelLoopRegion; +} + +bool ParallelRegionInfo::isParallelLoop(const Loop &L, + const DominatorTree &DT) const { + // See ParallelRegionInfo::getParallelLoopRegion(...) for more information. + return getParallelLoopRegion(L, DT) != nullptr; +} + bool ParallelRegionInfo::containedInAny(const BasicBlock *BB, const DominatorTree &DT) const { return std::any_of( @@ -468,6 +587,8 @@ bool ParallelRegionInfo::isSafeToPromote(const AllocaInst &AI, const DominatorTree &DT) const { + if (empty()) + return true; // First check if we know that AI is contained in a parallel region. 
ParallelRegion *AIPR = nullptr; Index: lib/Passes/LLVMBuild.txt =================================================================== --- lib/Passes/LLVMBuild.txt +++ lib/Passes/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Passes parent = Libraries -required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation +required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation PIR Index: lib/Transforms/CMakeLists.txt =================================================================== --- lib/Transforms/CMakeLists.txt +++ lib/Transforms/CMakeLists.txt @@ -5,5 +5,6 @@ add_subdirectory(IPO) add_subdirectory(Vectorize) add_subdirectory(Hello) +add_subdirectory(PIR) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) Index: lib/Transforms/LLVMBuild.txt =================================================================== --- lib/Transforms/LLVMBuild.txt +++ lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC +subdirectories = Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC PIR [component_0] type = Group Index: lib/Transforms/PIR/CMakeLists.txt =================================================================== --- /dev/null +++ lib/Transforms/PIR/CMakeLists.txt @@ -0,0 +1,10 @@ +add_llvm_library(LLVMPIR + SequentializePIR.cpp + PIR.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + + DEPENDS + intrinsics_gen + ) Index: lib/Transforms/PIR/LLVMBuild.txt =================================================================== --- lib/Transforms/PIR/LLVMBuild.txt +++ lib/Transforms/PIR/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Passes/LLVMBuild.txt -------------------------------*- Conf -*--===; +;===- ./lib/Transforms/PIR/LLVMBuild.txt 
-----------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,6 +17,6 @@ [component_0] type = Library -name = Passes -parent = Libraries -required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation +name = PIR +parent = Transforms +required_libraries = Analysis BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation Index: lib/Transforms/PIR/PIR.cpp =================================================================== --- /dev/null +++ lib/Transforms/PIR/PIR.cpp @@ -0,0 +1,16 @@ +//===-- PIR.cpp -----------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/InitializePasses.h" + +using namespace llvm; + +void llvm::initializePIROpts(PassRegistry &Registry) { + initializeSequentializePIRPass(Registry); +} Index: lib/Transforms/PIR/SequentializePIR.cpp =================================================================== --- /dev/null +++ lib/Transforms/PIR/SequentializePIR.cpp @@ -0,0 +1,156 @@ +//===-- PIR/SequentializePIR.cpp - Sequentialize parallel regions ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass will lower all parallel regions to sequential IR. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ParallelRegionInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +/// This pass will lower all parallel regions to sequential IR. It replaces +/// PIR instructions by unconditional branch instructions as follows: +/// - fork is replaced with a branch to the forked block. +/// - halt is replaced with a branch to the sibling continuation block. +/// - join is replaced with a branch to its destination block. +/// +/// If a parallel loop is encountered we annotate it as parallel using metadata. +struct SequentializePIR : public FunctionPass { + static char ID; + + SequentializePIR() : FunctionPass(ID) { + initializeSequentializePIRPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<ParallelRegionInfoPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<ParallelRegionInfoPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.setPreservesCFG(); + } + + ParallelTask::VisitorTy createParallelTaskAnnotator(MDNode *LoopID) { + assert(LoopID); + + // Helper function that annotates all read/write instructions in a basic + // block with parallel memory metadata referring to LoopID. 
+ ParallelTask::VisitorTy Annotator = [LoopID](BasicBlock &BB, + const ParallelTask &) { + for (Instruction &I : BB) { + if (!I.mayReadOrWriteMemory()) + continue; + + MDNode *LoopIdMD = + I.getMetadata(LLVMContext::MD_mem_parallel_loop_access); + if (!LoopIdMD) + LoopIdMD = MDNode::get(I.getContext(), {LoopID}); + else + LoopIdMD = MDNode::concatenate(LoopIdMD, LoopID); + I.setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopIdMD); + } + + return true; + }; + + return Annotator; + } + + bool runOnFunction(Function &F) override { + ParallelRegionInfo &PRI = + getAnalysis<ParallelRegionInfoPass>().getParallelRegionInfo(); + + // Exit early if there is nothing to do. + if (PRI.empty()) + return false; + + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + const DominatorTree &DT = + getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + ParallelRegionInfo::ParallelRegionVectorTy ParallelRegions; + PRI.getAllParallelRegions(ParallelRegions); + + for (ParallelRegion *PR : ParallelRegions) { + Loop *L = LI.getLoopFor(PR->getFork().getParent()); + bool AnnotateAsParallel = + (L && !L->isAnnotatedParallel() && PR->isParallelLoopRegion(*L, DT)); + + // Replace the fork, join and halt instructions by branches. Note that + // this does not change the dominator tree or loop info. + ForkInst &Fork = PR->getFork(); + BranchInst::Create(Fork.getForkedBB(), &Fork); + Fork.eraseFromParent(); + + ParallelTask &ForkedTask = PR->getForkedTask(); + for (TerminatorInst *TI : ForkedTask.getHaltsOrJoints()) { + assert(isa<HaltInst>(TI) && "Forked task was not terminated by a halt!"); + BranchInst::Create(TI->getSuccessor(0), TI); + TI->eraseFromParent(); + } + + ParallelTask &ContinuationTask = PR->getContinuationTask(); + for (TerminatorInst *TI : ContinuationTask.getHaltsOrJoints()) { + assert(isa<JoinInst>(TI) && + "Continuation task was not terminated by a join!"); + BranchInst::Create(TI->getSuccessor(0), TI); + TI->eraseFromParent(); + } + + if (!AnnotateAsParallel) + continue; + + // Get or create a loop id for the current loop. 
+ MDNode *LoopID = L->getLoopID(); + if (!LoopID) { + LoopID = MDNode::get(F.getContext(), {nullptr}); + LoopID->replaceOperandWith(0, LoopID); + L->setLoopID(LoopID); + } + + // Annotate all blocks in the forked task with + // llvm.mem.parallel_loop_access metadata. Note that no other side-effects + // are allowed in the loop outside the forked task. + ParallelTask::VisitorTy Annotator = createParallelTaskAnnotator(LoopID); + ForkedTask.visit(Annotator); + + // Verify that (for simplified loops) the annotation is recognized. + assert(!L->isLoopSimplifyForm() || L->isAnnotatedParallel()); + } + + // Update the parallel region info. + PRI.clear(); + assert(PRI.empty()); + + return true; + } +}; +} + +char SequentializePIR::ID = 0; + +INITIALIZE_PASS_BEGIN(SequentializePIR, "sequentialize-pir", + "Lower parallel regions to sequential IR", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ParallelRegionInfoPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(SequentializePIR, "sequentialize-pir", + "Lower parallel regions to sequential IR", false, false) Index: lib/Transforms/Scalar/SROA.cpp =================================================================== --- lib/Transforms/Scalar/SROA.cpp +++ lib/Transforms/Scalar/SROA.cpp @@ -30,6 +30,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ParallelRegionInfo.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -4082,6 +4083,13 @@ AI.eraseFromParent(); return true; } + + // For now we do not perform SROA if it might break a parallel region. + // TODO Perform this check on the slices and promote the ones that are safe to + // promote. 
+ if (!PRI->isSafeToPromote(AI, *DT)) + return false; + const DataLayout &DL = AI.getModule()->getDataLayout(); // Skip alloca forms that this analysis can't handle. @@ -4191,11 +4199,13 @@ } PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, - AssumptionCache &RunAC) { + AssumptionCache &RunAC, + ParallelRegionInfo &RunPRI) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); DT = &RunDT; AC = &RunAC; + PRI = &RunPRI; BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); @@ -4243,7 +4253,8 @@ PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) { return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F), - AM.getResult<AssumptionAnalysis>(F)); + AM.getResult<AssumptionAnalysis>(F), + AM.getResult<ParallelRegionAnalysis>(F)); } /// A legacy pass for the legacy pass manager that wraps the \c SROA pass. @@ -4264,12 +4275,14 @@ auto PA = Impl.runImpl( F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), - getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), + getAnalysis<ParallelRegionInfoPass>().getParallelRegionInfo()); return !PA.areAllPreserved(); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<ParallelRegionInfoPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.setPreservesCFG(); } @@ -4286,5 +4299,6 @@ "Scalar Replacement Of Aggregates", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ParallelRegionInfoPass) INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", false, false) Index: lib/Transforms/Utils/Mem2Reg.cpp =================================================================== --- lib/Transforms/Utils/Mem2Reg.cpp +++ lib/Transforms/Utils/Mem2Reg.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/Mem2Reg.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ParallelRegionInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include 
"llvm/IR/Instructions.h" @@ -28,7 +29,8 @@ STATISTIC(NumPromoted, "Number of alloca's promoted"); static bool promoteMemoryToRegister(Function &F, DominatorTree &DT, - AssumptionCache &AC) { + AssumptionCache &AC, + ParallelRegionInfo &PRI) { std::vector<AllocaInst *> Allocas; BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function bool Changed = false; @@ -40,7 +42,7 @@ // the entry node for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? - if (isAllocaPromotable(AI)) + if (isAllocaPromotable(AI, &DT, &PRI)) Allocas.push_back(AI); if (Allocas.empty()) @@ -56,7 +58,8 @@ PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); - if (!promoteMemoryToRegister(F, DT, AC)) + auto &PRI = AM.getResult<ParallelRegionAnalysis>(F); + if (!promoteMemoryToRegister(F, DT, AC, PRI)) return PreservedAnalyses::all(); PreservedAnalyses PA; @@ -81,12 +84,15 @@ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AssumptionCache &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - return promoteMemoryToRegister(F, DT, AC); + ParallelRegionInfo &PRI = + getAnalysis<ParallelRegionInfoPass>().getParallelRegionInfo(); + return promoteMemoryToRegister(F, DT, AC, PRI); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<ParallelRegionInfoPass>(); AU.setPreservesCFG(); } }; @@ -98,6 +104,7 @@ false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ParallelRegionInfoPass) INITIALIZE_PASS_END(PromoteLegacyPass, "mem2reg", "Promote Memory to Register", false, false) Index: lib/Transforms/Utils/PromoteMemoryToRegister.cpp =================================================================== --- lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include 
"llvm/Analysis/IteratedDominanceFrontier.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/ParallelRegionInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" @@ -48,7 +49,13 @@ STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); -bool llvm::isAllocaPromotable(const AllocaInst *AI) { +bool llvm::isAllocaPromotable(const AllocaInst *AI, const DominatorTree *DT, + const ParallelRegionInfo *PRI) { + // If the dominator tree and parallel region info were given we check if the + // promotion would break any parallel region requirements. + if (PRI && DT && !PRI->isSafeToPromote(*AI, *DT)) + return false; + // FIXME: If the memory unit is of pointer or integer type, we can permit // assignments to subsections of the memory unit. unsigned AS = AI->getType()->getAddressSpace(); Index: test/PIR/alloca_promotion.ll =================================================================== --- /dev/null +++ test/PIR/alloca_promotion.ll @@ -0,0 +1,77 @@ +; RUN: opt -mem2reg -sroa -S < %s | FileCheck %s + +declare void @foo(); +declare void @bar(i32); + +; Verify we do not promote allocas that are used inside and outside a +; parallel region. +define i32 @alloca_used_in_seq_and_par_code() { +entry: +; CHECK: %local_alloca = alloca i32 + %local_alloca = alloca i32 + fork label %forked, %cont + +forked: ; preds = %entry +; CHECK: store i32 0, i32* %local_alloca + store i32 0, i32* %local_alloca + call void @foo() + halt label %cont + +cont: ; preds = %entry, %forked + call void @foo() +; CHECK: store i32 1, i32* %local_alloca + store i32 1, i32* %local_alloca + join label %join + +join: ; preds = %cont +; CHECK: %val = load i32, i32* %local_alloca + %val = load i32, i32* %local_alloca + ret i32 %val +} + +; Verify we do not promote allocas even if they are used only inside a parallel +; region but defined outside. 
+define i32 @alloca_used_only_in_par_code() { +entry: +; CHECK: alloca i32 + %local_alloca = alloca i32 + fork label %forked, %cont + +forked: ; preds = %entry + store i32 0, i32* %local_alloca + call void @foo() + halt label %cont + +cont: ; preds = %entry, %forked + call void @foo() + store i32 1, i32* %local_alloca + %val = load i32, i32* %local_alloca + join label %join + +join: ; preds = %cont +; CHECK: ret i32 %val + ret i32 %val +} + +; Verify we do promote allocas that are used only outside a parallel region. +define i32 @alloca_used_only_in_seq_code() { +entry: +; CHECK-NOT: alloca i32 + %local_alloca = alloca i32 + store i32 0, i32* %local_alloca + fork label %forked, %cont + +forked: ; preds = %entry + call void @foo() + halt label %cont + +cont: ; preds = %entry, %forked + call void @foo() + %val = load i32, i32* %local_alloca + join label %join + +join: ; preds = %cont + store i32 1, i32* %local_alloca +; CHECK: ret i32 0 + ret i32 %val +} Index: test/PIR/lower.ll =================================================================== --- /dev/null +++ test/PIR/lower.ll @@ -0,0 +1,78 @@ +; RUN: opt -sequentialize-pir -S < %s | FileCheck %s + +declare void @foo(); + +define void @simple() { +entry: + fork label %forked, %cont +; CHECK: entry +; CHECK-NEXT: br label %forked + +forked: + call void @foo() + halt label %cont +; CHECK: forked +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %cont + +cont: + call void @foo() + join label %join +; CHECK: cont +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %join + +join: + ret void +} + + +define void @nested() { +entry: + fork label %forked.outer, %cont.outer +; CHECK: entry +; CHECK-NEXT: br label %forked.outer + +forked.outer: + call void @foo() + fork label %forked.inner, %cont.inner +; CHECK: forked.outer +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %forked.inner + +forked.inner: + call void @foo() + br label %forked.inner.body +; CHECK: forked.inner +; CHECK-NEXT: call void @foo +; 
CHECK-NEXT: br label %forked.inner.body + +forked.inner.body: + call void @foo() + halt label %cont.inner +; CHECK: forked.inner.body +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %cont.inner + +cont.inner: + call void @foo() + join label %forked.outer.end +; CHECK: cont.inner +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %forked.outer.end + +forked.outer.end: + halt label %cont.outer +; CHECK: forked.outer.end +; CHECK-NEXT: br label %cont.outer + +cont.outer: + call void @foo() + join label %join +; CHECK: cont +; CHECK-NEXT: call void @foo +; CHECK-NEXT: br label %join + +join: + ret void +} Index: test/PIR/parallel_loops.ll =================================================================== --- /dev/null +++ test/PIR/parallel_loops.ll @@ -0,0 +1,288 @@ +; RUN: opt -sequentialize-pir -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; void f0(int *A, int N) { +; int i = 0; +; #pragma parallel +; do { +; A[i] += 1; +; } while (i++ < N); +; } +; +; Function Attrs: noinline nounwind uwtable +define void @f0(i32* %A, i32 %N) #0 { +entry: + %tmp = sext i32 %N to i64 + br label %do.body + +do.body: ; preds = %do.cond, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ 0, %entry ] + br label %pr.begin + +pr.begin: + fork label %forked, %do.cond + +forked: + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %tmp1, 1 + store i32 %add, i32* %arrayidx, align 4 + halt label %do.cond + +; CHECK: forked: +; CHECK: %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID0:[0-9]*]] +; CHECK: store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID0]] +; CHECK: br label %do.cond + +do.cond: ; preds = %do.body + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %do.body, label %do.end + +; CHECK: do.cond: +; CHECK: br i1 %cmp, 
label %do.body, label %do.end, !llvm.loop ![[L0:[0-9]*]] + +do.end: ; preds = %do.cond + join label %return + +return: + ret void +} + +; void f1(int *A, int N) { +; int i = N; +; #pragma parallel +; do { +; A[i] += 1; +; } while (i-- >= 0); +; } +; +; Function Attrs: noinline nounwind uwtable +define void @f1(i32* %A, i32 %N) #0 { +entry: + %tmp = sext i32 %N to i64 + br label %do.body + +do.body: ; preds = %do.cond, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ %tmp, %entry ] + br label %pr.begin + +pr.begin: + fork label %forked, %do.cond + +forked: + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %tmp1, 1 + store i32 %add, i32* %arrayidx, align 4 + halt label %do.cond + +; CHECK: forked: +; CHECK: %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID1:[0-9]*]] +; CHECK: store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID1]] +; CHECK: br label %do.cond + +do.cond: ; preds = %do.body + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, -1 + +; CHECK: do.cond: +; CHECK: br i1 %cmp, label %do.body, label %do.end, !llvm.loop ![[L1:[0-9]*]] + + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.cond + join label %return + +return: + ret void +} + +; void f2(int *A, int N) { +; int i = 0; +; #pragma parallel +; do { +; A[i] += 1; +; } while (i++ < N); +; } +; +; Function Attrs: noinline nounwind uwtable +define void @f2(i32* %A, i32 %N) #0 { +entry: + %tmp = sext i32 %N to i64 + br label %do.body + +do.body: ; preds = %do.cond, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ 0, %entry ] + br label %pr.begin + +pr.begin: ; preds = %do.body + fork label %forked, %do.cond + +forked: ; preds = %pr.begin + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !0 + %add = 
add nsw i32 %tmp1, 1 + store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !0 + halt label %do.cond + +; CHECK: forked: +; CHECK: %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID2:[0-9]*]] +; CHECK: store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID2]] +; CHECK: br label %do.cond + +do.cond: ; preds = %forked + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %do.body, label %do.end, !llvm.loop !1 + +; CHECK: do.cond: +; CHECK: br i1 %cmp, label %do.body, label %do.end, !llvm.loop ![[L2:[0-9]*]] + +do.end: ; preds = %do.cond + join label %return + +return: ; preds = %do.end + ret void +} + +; void f3(int *A, int N) { +; int i = N; +; #pragma parallel +; do { +; A[i] += 1; +; } while (i-- >= 0); +; } +; +; CHECK: @f3 +; +; Function Attrs: noinline nounwind uwtable +define void @f3(i32* %A, i32 %N) #0 { +entry: + %tmp = sext i32 %N to i64 + br label %do.body + +do.body: ; preds = %do.cond, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ %tmp, %entry ] + br label %pr.begin + +pr.begin: ; preds = %do.body + fork label %forked, %do.cond + +; CHECK: pr.begin: +; CHECK-NEXT: br label %forked + +forked: ; preds = %pr.begin + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !2 + %add = add nsw i32 %tmp1, 1 + store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !2 + halt label %do.cond + +; CHECK: forked: +; CHECK: %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID3:[0-9]*]] +; CHECK: store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID3]] +; CHECK: br label %do.cond + +do.cond: ; preds = %forked + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, -1 + br i1 %cmp, label %do.body, label %do.end, !llvm.loop !3 + +; CHECK: 
do.cond: +; CHECK: br i1 %cmp, label %do.body, label %do.end, !llvm.loop ![[L3:[0-9]*]] + +do.end: ; preds = %do.cond + join label %return + +return: ; preds = %do.end + ret void +} + +; void f4(int *A, int N) { +; int i = N; +; do { +; #pragma parallel +; do { +; A[i] += 1; +; } while (i-- >= 0); +; } while (0); +; } +; +; Function Attrs: noinline nounwind uwtable +; +; CHECK: @f4 +define void @f4(i32* %A, i32 %N) #0 { +entry: + %tmp = sext i32 %N to i64 + br label %outer.do.body + +outer.do.body: + br label %do.body + +do.body: ; preds = %do.cond, %outer.do.body + %indvars.iv = phi i64 [ %indvars.iv.next, %do.cond ], [ %tmp, %outer.do.body ] + br label %pr.begin + +pr.begin: ; preds = %do.body + fork label %forked, %do.cond + +forked: ; preds = %pr.begin + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !4 + %add = add nsw i32 %tmp1, 1 + store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !4 + halt label %do.cond + +; CHECK: %tmp1 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID4:[0-9]*]] +; CHECK: store i32 %add, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access ![[PID4]] + +do.cond: ; preds = %forked + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, -1 + br i1 %cmp, label %do.body, label %do.end + +; CHECK: do.cond +; CHECK: br i1 %cmp, label %do.body, label %do.end, !llvm.loop ![[LInner4:[0-9]*]] + +do.end: ; preds = %do.cond + join label %outer.do.cond + +outer.do.cond: + br i1 false, label %outer.do.body, label %outer.do.end, !llvm.loop !5 + +; CHECK: outer.do.cond +; CHECK: br i1 false, label %outer.do.body, label %outer.do.end, !llvm.loop ![[LOuter4:[0-9]*]] + +outer.do.end: + br label %return + +return: ; preds = %outer.do.end + ret void +} + +!0 = !{!1} +!1 = distinct !{!1} +!2 = !{!3} +!3 = distinct !{!3} +!4 = !{!5} +!5 = distinct !{!5} + +; CHECK: ![[PID0]] = !{![[L0]]} +; CHECK: ![[L0]] = 
distinct !{![[L0]]} + +; CHECK: ![[PID1]] = !{![[L1]]} +; CHECK: ![[L1]] = distinct !{![[L1]]} + +; CHECK: ![[PID2]] = !{![[L2]]} +; CHECK: ![[L2]] = distinct !{![[L2]]} + +; CHECK: ![[PID3]] = !{![[L3]]} +; CHECK: ![[L3]] = distinct !{![[L3]]} + +; CHECK: ![[PID4]] = !{![[LOuter4]], ![[LInner4]]} +; CHECK: ![[LOuter4]] = distinct !{![[LOuter4]]} +; CHECK: ![[LInner4]] = distinct !{![[LInner4]]} + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: tools/opt/CMakeLists.txt =================================================================== --- tools/opt/CMakeLists.txt +++ tools/opt/CMakeLists.txt @@ -6,6 +6,7 @@ Core Coroutines IPO + PIR IRReader InstCombine Instrumentation Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -378,6 +378,7 @@ initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializePIROpts(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. initializeCodeGenPreparePass(Registry);