Index: include/llvm/Analysis/ParallelIR/KMPCImpl.h =================================================================== --- /dev/null +++ include/llvm/Analysis/ParallelIR/KMPCImpl.h @@ -0,0 +1,198 @@ +//===- ParallelIR/KMPCImpl.h - KMPC parallel region impl. -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Interface of the parallel regions for the OpenMP KMPC runtime library call +// representation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_PARALLELIR_KMPCIMPL_H +#define LLVM_ANALYSIS_PARALLELIR_KMPCIMPL_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ParallelIR/RegionInfo.h" +#include "llvm/IR/Instructions.h" + +namespace llvm { + +class DominatorTree; +class KMPC_ParallelRegion; + +/// See @p ParallelIRCommunicationInfo +class KMPC_CommunicationInfo : public ParallelIRCommunicationInfo { + + /// Backwards reference to the parallel region for which this communication + /// interface was created. + const KMPC_ParallelRegion &PR; + + KMPC_CommunicationInfo(const KMPC_ParallelRegion &PR) : PR(PR) {} + +public: + + /// Return the runtime library call that initiates the communication. 
+ CallInst &getRTCall() const; + + /// See @p ParallelIRCommunicationInfo::getAllCommunicatingParallelRegions + virtual bool getAllCommunicatingParallelRegions( + SmallVectorImpl<ParallelRegion *> &CommunicatingParallelRegions) + const override; + + /// See @p ParallelIRCommunicationInfo::getNumCommunicatedValues + virtual unsigned getNumCommunicatedValues() const override; + + /// See @p ParallelIRCommunicationInfo::getCommunicatedValue + virtual Value *getCommunicatedValue(unsigned Idx) const override; + + /// See @p ParallelIRCommunicationInfo::getCommunicationKind + virtual CommunicationKind getCommunicationKind(unsigned Idx) const override; + + /// See @p ParallelIRCommunicationInfo::getCommunicatedValues + virtual void getCommunicatedValues( + SmallVectorImpl<Value *> &CommunicatedValues) const override; + + /// See @p ParallelIRCommunicationInfo::getCommunicatedValueInParallelRegion + virtual Value * + getCommunicatedValueInParallelRegion(unsigned Idx) const override; + + /// See @p ParallelIRCommunicationInfo::hasAnnotatableCommunication + virtual bool hasAnnotatableCommunication() const override; + + /// See @p ParallelIRCommunicationInfo::hasAttributeInParallelRegion + virtual bool + hasAttributeInParallelRegion(unsigned Idx, + Attribute::AttrKind Kind) const override; + + friend class KMPC_ParallelRegion; +}; + +/// Specialization of the ParallelRegion interface for the OpenMP KMPC runtime +/// library representation. +/// +/// Note: This class is abstract as well. It collects the commonalities between +/// KMPC_ForkParallelRegion and KMPC_TaskParallelRegion defined below. +/// +/// See @p ParallelRegion +class KMPC_ParallelRegion : public ParallelRegion { + + /// The subfunction that contains the parallel code. + Function &ParallelSubFn; + + /// The communication info object for this parallel region. 
+ KMPC_CommunicationInfo CommunicationInfo; + +public: + KMPC_ParallelRegion(CallInst &KMPC_CI, Function &ParallelSubFn, + ParallelIRRegionInfo &PRI) + : ParallelRegion(KMPC_CI, KMPC_CI, PRI), ParallelSubFn(ParallelSubFn), + CommunicationInfo(*this) {} + + /// Return the parallel subfunction for this parallel region. + /// + /// Note that there might be multiple regions sharing the same parallel + /// subfunction. + /// See @p ParallelIRCommunicationInfo::getAllCommunicatingParallelRegions + Function &getParallelSubFn() const { return ParallelSubFn; } + + /// See @p ParallelRegion::getFirstInsertionPoint + virtual Instruction &getFirstInsertionPoint() const override; + + /// See @p ParallelRegion::getSequentialCodeFunction + virtual Function &getSequentialCodeFunction() const override; + + /// See @p ParallelRegion::getParallelCodeFunction + virtual Function &getParallelCodeFunction() const override; + + /// See @p ParallelRegion::getDefiniteBarriers + virtual void getDefiniteBarriers( + SmallVectorImpl &DefiniteBarriers) const override; + + /// See @p ParallelRegion::getPotentialBarriers + virtual void getPotentialBarriers( + SmallVectorImpl &PotentialBarriers) const override; + + /// See @p ParallelRegion::getThreadId + virtual Value *getThreadId() const override; + + /// See @p ParallelRegion::getLocalThreadId + virtual Value *getLocalThreadId() const override; + + /// See @p ParallelRegion::contains + virtual bool contains(const BasicBlock *BB, + const DominatorTree *) const override; + + /// See @p ParallelRegion::contains + virtual bool contains(const Instruction *I, + const DominatorTree *) const override; + + /// See @p ParallelRegion::visit + virtual bool visit(InstructionVisitorTy &Visitor) const override; + + /// See @p ParallelRegion::visit + virtual bool visit(BlockVisitorTy &Visitor) const override; + + /// See @p ParallelRegion::print + virtual void print(raw_ostream &OS, unsigned indent) const override; + + /// See @p 
ParallelRegion::getCommunicationInfo + virtual const ParallelIRCommunicationInfo & + getCommunicationInfo() const override { + return CommunicationInfo; + } +}; + +/// See @p KMPC_ParallelRegion +class KMPC_ForkParallelRegion : public KMPC_ParallelRegion { + + /// Private constructor, generation via findKMPCForkCalls. + KMPC_ForkParallelRegion(CallInst &KMPC_ForkCI, Function &ParallelSubFn, + ParallelIRRegionInfo &PRI); + +public: + + /// See @p ParallelRegion::getKind + virtual ParallelRegionKind getKind() const override { + return PRK_KMPC_FORK_RT; + } + + /// See @p ParallelRegion::isKind + virtual bool isKind(ParallelRegionKind Kind) const override { + return (Kind & PRK_KMPC_FORK_RT) == Kind; + } + + /// Find all KMPC fork calls in @p M and register the KMPC_ForkParallelRegion + /// parallel regions with @p PRI. + static void findKMPCForkCalls(Module &M, ParallelIRRegionInfo &PRI); +}; + +/// See @p KMPC_ParallelRegion +class KMPC_TaskParallelRegion : public KMPC_ParallelRegion { + + /// Private constructor, generation via findKMPCTaskCalls. + KMPC_TaskParallelRegion(CallInst &KMPC_TaskCI, Function &ParallelSubFn, + ParallelIRRegionInfo &PRI); + +public: + + /// See @p ParallelRegion::getKind + virtual ParallelRegionKind getKind() const override { + return PRK_KMPC_TASK_RT; + } + + /// See @p ParallelRegion::isKind + virtual bool isKind(ParallelRegionKind Kind) const override { + return (Kind & PRK_KMPC_TASK_RT) == Kind; + } + + /// Find all KMPC task calls in @p M and register the KMPC_TaskParallelRegion + /// parallel regions with @p PRI. 
+ static void findKMPCTaskCalls(Module &M, ParallelIRRegionInfo &PRI); +}; + +} // namespace llvm +#endif Index: include/llvm/Analysis/ParallelIR/RegionInfo.h =================================================================== --- /dev/null +++ include/llvm/Analysis/ParallelIR/RegionInfo.h @@ -0,0 +1,348 @@ +//===- ParallelIRRegionInfo.h - Parallel region analysis --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Abstract analyses interfaces to inspect parallel codes and passes to provide +// information about parallelism inside the module. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_PARALLELIR_REGIONINFO_H +#define LLVM_ANALYSIS_PARALLELIR_REGIONINFO_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" + +namespace llvm { + +class ParallelRegion; +class ParallelIRBuilder; + +/// Helper structure that decouples communication related queries from the +/// parallel region interface. +/// +/// Communication is conceptually divided into two parts, the sequential and the +/// parallel one. The communicated values could be different on the other side +/// and they are therefore only identified by their index. The way values can be +/// communicated is either by-value or inside a container, thus through a +/// pointer type that is dereferenced. +struct ParallelIRCommunicationInfo { + + /// Return all parallel regions that might be involved with either side of + /// the parallel communication interface. There might be multiple if code was + /// reused or parts of the interface were duplicated, e.g., though unrolling. 
+ virtual bool getAllCommunicatingParallelRegions( + SmallVectorImpl &CommunicatingParallelRegions) + const = 0; + + /// Flags to distinguish the different kinds of communication. + enum CommunicationKind { + CK_VALUE, ///< communication by-value + CK_CONTAINER_IN, ///< communication through a read-only container + CK_CONTAINER_OUT, ///< communication through a write-only container + CK_CONTAINER_IN_OUT, ///< communication through a container + CK_UNKNOWN, ///< unknown/complication communication + }; + + /// Return the number of communicated values. + virtual unsigned getNumCommunicatedValues() const = 0; + + /// Return the communicated value number @p Idx in the sequential code. + virtual Value *getCommunicatedValue(unsigned Idx) const = 0; + + /// Return the communicated value number @p Idx in the parallel code. + virtual Value *getCommunicatedValueInParallelRegion(unsigned Idx) const = 0; + + /// Return the kind of communication used for value number @p Idx. + virtual CommunicationKind getCommunicationKind(unsigned Idx) const = 0; + + /// Return all communicated values in the sequential code. + virtual void getCommunicatedValues( + SmallVectorImpl &CommunicatedValues) const = 0; + + /// Return true if the communication can be annotated with attributes. + virtual bool hasAnnotatableCommunication() const = 0; + + /// Return true if the communicated value @p Idx in the parallel code has + /// attribute @p Kind. If attribute annotation is not possible this function + /// shall gracefully return false. + virtual bool hasAttributeInParallelRegion(unsigned Idx, + Attribute::AttrKind Kind) const = 0; +}; + +/// The parallel region info (PRI) identifies parallel regions and provides +/// convenient information on them. +/// +/// Currently the parallel region info is "lazy" in the sense that it does only +/// need to be updated if new parallel regions are created (or deleted). 
As this +/// should not happen very often (and only in very few places) it allows +/// transformation passes to preserve the parallel region info without +/// modifications. Additionally, it makes the analysis very lightweight in the +/// absence of parallel regions (which should be the majority of functions). +/// +class ParallelIRRegionInfo { +public: + + /// Container type for parallel regions. + using ParallelRegionContainer = SmallVector; + using ParallelRegionContainerMap = + DenseMap; + + /// Iterator types for the parallel region container. + using iterator = ParallelRegionContainerMap::iterator; + using const_iterator = ParallelRegionContainerMap::const_iterator; + +private: + + /// The parallel regions discovered in the program. + ParallelRegionContainerMap ParallelRegionsMap; + + /// Register the parallel region @p PR. + void addParallelRegion(ParallelRegion &PR); + +public: + ParallelIRRegionInfo() {} + ParallelIRRegionInfo(Module &M) { + recalculate(M); + } + ~ParallelIRRegionInfo() { releaseMemory(); } + + /// Identify the parallel regions in @p M from scratch. + void recalculate(Module &M); + + /// Return the parallel region for @p I if any. + ParallelRegion *getParallelRegionFor(Instruction *I) const; + + /// Return a vector with all parallel regions in this function. + /// + ///{ + ParallelRegionContainer getParallelRegions(Function &F) const { + return ParallelRegionsMap.lookup(&F); + } + ///} + + /// Iterators to visit all parallel regions, function by function + /// + ///{ + iterator begin() { return ParallelRegionsMap.begin(); } + iterator end() { return ParallelRegionsMap.end(); } + + const_iterator begin() const { return ParallelRegionsMap.begin(); } + const_iterator end() const { return ParallelRegionsMap.end(); } + ///} + + /// Delete all memory allocated for parallel regions. + void releaseMemory(); + + /// Pretty print all parallel regions. 
+ ///{ + void print(raw_ostream &OS) const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const; +#endif + ///} + + friend class ParallelRegion; + friend class ParallelIRBuilder; +}; + +/// A parallel region is a single-entry, single-exit CFG region that +/// represents code that can be executed in parallel. +class ParallelRegion { + + /// The start point of this parallel region. + Instruction &StartPoint; + + /// The end point of this parallel region. + Instruction &EndPoint; + + /// The parallel region info analysis. + ParallelIRRegionInfo &PRI; + +protected: + + ParallelRegion(Instruction &StartPoint, Instruction &EndPoint, + ParallelIRRegionInfo &PRI) + : StartPoint(StartPoint), EndPoint(EndPoint), PRI(PRI) { + // Register a new parallel region always with the parallel region info. + PRI.addParallelRegion(*this); + } + +public: + virtual ~ParallelRegion(); + + /// Return the start point of this parallel region. + Instruction &getStartPoint() const { return StartPoint; } + + /// Return the end point of this parallel region. + Instruction &getEndPoint() const { return EndPoint; } + + /// Return the first instruction in this parallel region before which new code + /// can be inserted. + virtual Instruction &getFirstInsertionPoint() const = 0; + + /// Return the function that contains the sequential code surrounding the + /// parallel region. + virtual Function &getSequentialCodeFunction() const = 0; + + /// Return the function that contains the code executed in parallel. + virtual Function &getParallelCodeFunction() const = 0; + + /// Return all definite barrier instructions. + virtual void getDefiniteBarriers( + SmallVectorImpl &DefiniteBarriers) const = 0; + + /// Return all potential barrier instructions. + virtual void getPotentialBarriers( + SmallVectorImpl &PotentialBarriers) const = 0; + + /// Return true if @p I might have barrier semantics for this parallel region. 
+ virtual bool isPotentialBarrier(Instruction &I) const; + + /// Enumeration of all known parallel region kinds. + enum ParallelRegionKind { + PRK_KMPC_RT = 4, ///< General KMPC runtime call + PRK_KMPC_FORK_RT = 5, ///< KMPC fork runtime call + PRK_KMPC_TASK_RT = 6, ///< KMPC task runtime call + }; + + /// Return the kind of this parallel region. + virtual ParallelRegionKind getKind() const = 0; + + /// Return true if this parallel region is of kind @p Kind. + virtual bool isKind(ParallelRegionKind Kind) const = 0; + + /// Return the global thread id if applicable and present. + /// + /// Note: The ParallelIR/Builder interface allows to create a new thread id. + virtual Value *getThreadId() const = 0; + + /// Return the local thread id if applicable and present. + /// + /// Note: The ParallelIR/Builder interface allows to create a new thread id. + virtual Value *getLocalThreadId() const = 0; + + /// Return a lightweight communication info object for this parallel region. + virtual const ParallelIRCommunicationInfo &getCommunicationInfo() const = 0; + + /// Type of the instruction visitor function. + /// + /// It will be invoked for every instruction in this parallel region until the + /// return value of the visitor is false. Note that only proper instructions + /// inside the parallel region are visited, thus no encoding instructions + /// only present to mark the parallel region. + /// + /// The return value indicates if the traversal should continue. + using InstructionVisitorTy = std::function<bool(Instruction &)>; + + /// Type of the block visitor function. + /// + /// It will be invoked for every basic block in this parallel region until the + /// return value of the visitor is false. The second argument is true only if + /// the block is not completely contained in the parallel region. + /// + /// The return value indicates if the traversal should continue. + using BlockVisitorTy = std::function<bool(BasicBlock &, bool)>; + + /// A generic visitor interface as an alternative to an iterator. 
+ /// + /// @returns True, if all instructions/blocks have been visited. + ///{ + virtual bool visit(InstructionVisitorTy &Visitor) const = 0; + virtual bool visit(BlockVisitorTy &Visitor) const = 0; + ///} + + /// The contain interface is designed deliberately different from similar + /// functions like Loop::contains(*) as it might take a dominator tree as a + /// second argument. This allows the ParallelRegion to remain valid even if + /// transformations change the CFG structure inside. As a consequence there + /// are less modifications needed in the existing code base. + ///{ + virtual bool contains(const BasicBlock *BB, + const DominatorTree *DT = nullptr) const; + virtual bool contains(const Instruction *I, + const DominatorTree *DT = nullptr) const; + ///} + + const ParallelIRRegionInfo &getParallelRegionInfo() const { return PRI; }; + + /// Pretty print this parallel region. + ///{ + virtual void print(raw_ostream &OS, unsigned indent = 0) const = 0; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const; +#endif + ///} + + friend class ParallelIRRegionInfo; +}; + +/// Pretty print the parallel region @p PR to @p OS. +inline raw_ostream &operator<<(raw_ostream &OS, const ParallelRegion &PR) { + PR.print(OS); + return OS; +} + +/// New pass manager wrapper pass around the parallel region info. +class ParallelIRRegionAnalysis + : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + typedef ParallelIRRegionInfo Result; + + /// Run the analysis pass over a module and identify the parallel regions. + /// + /// FIXME: This does not need to be a module pass but dependent passes in the + /// old pass manager do not work otherwise. + ParallelIRRegionInfo run(Module &M, ModuleAnalysisManager &MAM); +}; + +/// Module pass wrapper around the parallel region info. +/// +/// FIXME: This does not need to be a module pass but dependent passes in the +/// old pass manager do not work otherwise. 
+class ParallelIRRegionInfoPass : public ModulePass { + ParallelIRRegionInfo PRI; + +public: + static char ID; + ParallelIRRegionInfoPass() : ModulePass(ID) {} + + /// Return the parallel region info analysis. + ///{ + ParallelIRRegionInfo &getParallelIRRegionInfo() { return PRI; } + const ParallelIRRegionInfo &getParallelIRRegionInfo() const { return PRI; } + ///} + + /// Initialize the parallel region info for this function. + bool runOnModule(Module &) override; + + /// Verify the analysis as well as some of the functions provided. + void verifyAnalysis() const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Pretty print the parallel regions of the function. + ///{ + void print(raw_ostream &OS, const Module *) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const; +#endif + ///} +}; + +} // End llvm namespace +#endif Index: include/llvm/Analysis/Passes.h =================================================================== --- include/llvm/Analysis/Passes.h +++ include/llvm/Analysis/Passes.h @@ -79,6 +79,13 @@ // FunctionPass *createRegionInfoPass(); + //===--------------------------------------------------------------------===// + // + // createParallelRegionInfoPass - This pass finds all parallel regions + // in a function. + // + ModulePass *createParallelIRRegionInfoPass(); + // Print module-level debug info metadata in human-readable form. ModulePass *createModuleDebugInfoPrinterPass(); Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -61,6 +61,9 @@ /// Initialize all passes linked into the CodeGen library. void initializeTarget(PassRegistry&); +/// Initialize all passes linked into the ParallelIROpts library. 
+void initializeParallelIROpts(PassRegistry&); + void initializeAAEvalLegacyPassPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeADCELegacyPassPass(PassRegistry&); @@ -285,12 +288,14 @@ void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry&); void initializeOptimizePHIsPass(PassRegistry&); void initializePAEvalPass(PassRegistry&); +void initializeParallelIRRegionInfoPassPass(PassRegistry&); void initializePEIPass(PassRegistry&); void initializePGOIndirectCallPromotionLegacyPassPass(PassRegistry&); void initializePGOInstrumentationGenLegacyPassPass(PassRegistry&); void initializePGOInstrumentationUseLegacyPassPass(PassRegistry&); void initializePGOMemOPSizeOptLegacyPassPass(PassRegistry&); void initializePHIEliminationPass(PassRegistry&); +void initializeParallelIRAttributeAnnotatorLegacyPassPass(PassRegistry&); void initializePartialInlinerLegacyPassPass(PassRegistry&); void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry&); void initializePatchableFunctionPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -47,6 +47,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Instrumentation/BoundsChecking.h" #include "llvm/Transforms/ObjCARC.h" +#include "llvm/Transforms/ParallelIR.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils.h" @@ -188,6 +189,8 @@ (void) llvm::createMergeFunctionsPass(); (void) llvm::createMergeICmpsPass(); (void) llvm::createExpandMemCmpPass(); + (void) llvm::createParallelIRRegionInfoPass(); + (void) llvm::createParallelIRAttributeAnnotatorLegacyPass(); std::string buf; llvm::raw_string_ostream os(buf); (void) llvm::createPrintModulePass(os); Index: include/llvm/Transforms/ParallelIR.h =================================================================== 
--- /dev/null +++ include/llvm/Transforms/ParallelIR.h @@ -0,0 +1,28 @@ +//===-- ParallelIR.h - Parallel IR Transformations --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for accessor functions that expose passes +// in the ParallelIR transformations library. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_PARALLEL_IR_H +#define LLVM_TRANSFORMS_PARALLEL_IR_H + +namespace llvm { + +class ModulePass; + +//===----------------------------------------------------------------------===// +// +ModulePass *createParallelIRAttributeAnnotatorLegacyPass(); + +} // End llvm namespace + +#endif Index: include/llvm/Transforms/ParallelIR/AttributeAnnotator.h =================================================================== --- /dev/null +++ include/llvm/Transforms/ParallelIR/AttributeAnnotator.h @@ -0,0 +1,30 @@ +//===- AttributeAnnotator.h ----- Annotate attr. from/to parallel regions -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_PARALLEL_IR_ATTRIBUTE_ANNOTATOR_H +#define LLVM_TRANSFORMS_PARALLEL_IR_ATTRIBUTE_ANNOTATOR_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class Module; + +struct ParallelIRAttributeAnnotatorPass + : PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_PARALLEL_IR_REGION_MERGE_H Index: include/llvm/Transforms/ParallelIR/Builder.h =================================================================== --- /dev/null +++ include/llvm/Transforms/ParallelIR/Builder.h @@ -0,0 +1,50 @@ +//===- ParallelIR/Builder.h - Parallel region IR builder ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_PARALLELIR_BUILDER_H +#define LLVM_IR_PARALLELIR_BUILDER_H + +#include "llvm/Analysis/ParallelIR/RegionInfo.h" + +namespace llvm { + +/// Interface to modify and create parallel regions. The potentially different +/// implementation of this interface for the different kinds of parallel regions +/// will apply the modifications requested through these calls if that is +/// possible. If not, they shall gracefully ignore the request. +struct ParallelIRBuilder { + + /// Create a parallel IR builder for the region kind @p PRKind. 
+ static ParallelIRBuilder *Create(ParallelIRRegionInfo &PRI, + ParallelRegion::ParallelRegionKind PRKind); + + /// Add the attribute @p Kind to the communicated value at index @p Idx in the + /// sequential part of the communication interface defined by @p PRCI. + virtual bool + addAttributeInSequentialRegion(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, + Attribute::AttrKind Kind) const = 0; + + /// Add the attribute @p Kind to the communicated value at index @p Idx in the + /// parallel part of the communication interface defined by @p PRCI. + virtual bool + addAttributeInParallelRegion(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, + Attribute::AttrKind Kind) const = 0; + + /// Add the attribute @p Kind to the communicated value at index @p Idx in + /// both parts of the communication interface defined by @p PRCI. + virtual bool addAttribute(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, Attribute::AttrKind Kind) const = 0; +}; + +} +#endif Index: lib/Analysis/Analysis.cpp =================================================================== --- lib/Analysis/Analysis.cpp +++ lib/Analysis/Analysis.cpp @@ -82,6 +82,7 @@ initializeLCSSAVerificationPassPass(Registry); initializeMemorySSAWrapperPassPass(Registry); initializeMemorySSAPrinterLegacyPassPass(Registry); + initializeParallelIRRegionInfoPassPass(Registry); } void LLVMInitializeAnalysis(LLVMPassRegistryRef R) { Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -87,6 +87,9 @@ ValueTracking.cpp VectorUtils.cpp + ParallelIR/RegionInfo.cpp + ParallelIR/KMPCImpl.cpp + ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis Index: lib/Analysis/ParallelIR/KMPCImpl.cpp =================================================================== --- /dev/null +++ lib/Analysis/ParallelIR/KMPCImpl.cpp @@ -0,0 +1,282 @@ +//===- KMPCImpl.cpp - OpenMP runtime (KMPC) parallel 
region impl. ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the parallel regions for the OpenMP KMPC runtime library +// call representation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ParallelIR/KMPCImpl.h" + +#include "llvm/IR/Verifier.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "parallel-region-info" + +//===----------------------------------------------------------------------===// +// KMPC runtime parallel region implementation +// + +/// Return true if @p V is a call to the function @p CalledDecl in @p F. +static bool isCallToFunctionIn(Value *V, Function *CalledDecl, Function *F) { + auto *CI = dyn_cast<CallInst>(V); + return (CI && CI->getCalledFunction() == CalledDecl && + (!F || CI->getFunction() == F)); +} + +/// Put all calls to @p Name in @p F into the container @p Calls as @p RetTy. +template <typename RetTy> +static void collectCallsToInFunction(std::string Name, Function *F, Module &M, + SmallVectorImpl<RetTy *> &Calls) { + + // Look for the "Name" function declaration in the Module. If found, the users + // are possible calls. 
+ Function *FunctionDecl = M.getFunction(Name); + if (!FunctionDecl) + return; + + for (User *U : FunctionDecl->users()) + if (isCallToFunctionIn(U, FunctionDecl, F)) + Calls.push_back(cast(U)); +} + +Instruction &KMPC_ParallelRegion::getFirstInsertionPoint() const { + return ParallelSubFn.getEntryBlock().front(); + } + +Function &KMPC_ParallelRegion::getSequentialCodeFunction() const { + return *getStartPoint().getFunction(); +} + +Function &KMPC_ParallelRegion::getParallelCodeFunction() const { + return ParallelSubFn; +} + +void KMPC_ParallelRegion::getDefiniteBarriers( + SmallVectorImpl &DefiniteBarriers) const { + collectCallsToInFunction("__kmpc_barrier", &ParallelSubFn, + *ParallelSubFn.getParent(), DefiniteBarriers); +} + +void KMPC_ParallelRegion::getPotentialBarriers( + SmallVectorImpl &PotentialBarriers) const { + InstructionVisitorTy BarrierCollector = [&](Instruction &I) { + if (!ParallelRegion::isPotentialBarrier(I)) + return true; + + CallInst *CI = dyn_cast(&I); + if (!CI) + return true; + + if (CI->getCalledFunction()) { + const StringRef &Name = CI->getCalledFunction()->getName(); + if (Name == "__kmpc_for_static_init_4" || + Name == "__kmpc_for_static_fini") + return true; + } + + PotentialBarriers.push_back(&I); + return true; + }; + + visit(BarrierCollector); +} + +Value *KMPC_ParallelRegion::getThreadId() const { + Value *ThreadIdPtr = ParallelSubFn.arg_begin(); + + for (Value *User : ThreadIdPtr->users()) + if (LoadInst *LI = dyn_cast(User)) + return LI; + + return nullptr; +} + +Value *KMPC_ParallelRegion::getLocalThreadId() const { + Value *LocalThreadIdPtr = ParallelSubFn.arg_begin() + 1; + + for (Value *User : LocalThreadIdPtr->users()) + if (LoadInst *LI = dyn_cast(User)) + return LI; + + return nullptr; +} + +bool KMPC_ParallelRegion::contains(const BasicBlock *BB, + const DominatorTree *) const { + return BB->getParent() == &ParallelSubFn; +} + +bool KMPC_ParallelRegion::contains(const Instruction *I, + const DominatorTree *) const { + 
return I->getFunction() == &ParallelSubFn || I == &getStartPoint(); +} + +bool KMPC_ParallelRegion::visit(InstructionVisitorTy &Visitor) const { + for (BasicBlock &BB : ParallelSubFn) + for (Instruction &I : BB) + if (!Visitor(I)) + return false; + return true; +} + +bool KMPC_ParallelRegion::visit(BlockVisitorTy &Visitor) const { + for (BasicBlock &BB : ParallelSubFn) + if (!Visitor(BB, false)) + return false; + return true; +} + +void KMPC_ParallelRegion::print(raw_ostream &OS, unsigned indent) const { + OS.indent(indent) << "Parallel Region [" << getKind() << "]:\n"; + OS.indent(indent) << " fork call: " << getStartPoint() << "\n"; + OS.indent(indent) << " sub-function: " << ParallelSubFn.getName() + << "\n"; + + const ParallelIRCommunicationInfo &CI = getCommunicationInfo(); + for (unsigned u = 0; u < CI.getNumCommunicatedValues(); u++) + OS.indent(indent) << " communicated: " << *CI.getCommunicatedValue(u) + << " : " << CI.getCommunicationKind(u) << "\n"; +} + +//===----------------------------------------------------------------------===// +// KMPC_TaskParallelRegion implementation +// + +KMPC_TaskParallelRegion::KMPC_TaskParallelRegion(CallInst &KMPC_TaskCI, + Function &ParallelSubFn, + ParallelIRRegionInfo &PRI) + : KMPC_ParallelRegion(KMPC_TaskCI, ParallelSubFn, PRI) {} + +void KMPC_TaskParallelRegion::findKMPCTaskCalls( + Module &M, ParallelIRRegionInfo &PRI) { + SmallVector KMPC_TaskCalls; + collectCallsToInFunction("__kmpc_omp_task", nullptr, M, KMPC_TaskCalls); + + // Calls of the "__kmpc_omp_task" function are actually parallel regions. 
+ for (CallInst *CI : KMPC_TaskCalls) { + + Function *ParallelFunc = + cast(CI->getArgOperand(2)->stripPointerCasts()); + assert(ParallelFunc); + + new KMPC_TaskParallelRegion(*CI, *ParallelFunc, PRI); + } +} + +void findKMPCTaskCalls(Module &M, ParallelIRRegionInfo &PRI) { + KMPC_TaskParallelRegion::findKMPCTaskCalls(M, PRI); +} + +//===----------------------------------------------------------------------===// +// KMPC_ForkParallelRegion implementation +// + +KMPC_ForkParallelRegion::KMPC_ForkParallelRegion(CallInst &KMPC_ForkCI, + Function &ParallelSubFn, + ParallelIRRegionInfo &PRI) + : KMPC_ParallelRegion(KMPC_ForkCI, ParallelSubFn, PRI) {} + +void KMPC_ForkParallelRegion::findKMPCForkCalls( + Module &M, ParallelIRRegionInfo &PRI) { + SmallVector KMPC_ForkCalls; + collectCallsToInFunction("__kmpc_fork_call", nullptr, M, KMPC_ForkCalls); + + // Calls of the "__kmpc_fork_call" function are actually parallel regions. + for (CallInst *CI : KMPC_ForkCalls) { + + Function *ParallelFunc = + cast(CI->getArgOperand(2)->stripPointerCasts()); + assert(ParallelFunc); + + new KMPC_ForkParallelRegion(*CI, *ParallelFunc, PRI); + } +} + +void findKMPCForkCalls(Module &M, ParallelIRRegionInfo &PRI) { + KMPC_ForkParallelRegion::findKMPCForkCalls(M, PRI); +} + +//===----------------------------------------------------------------------===// +// CommunicationInfo implementation +// + +CallInst &KMPC_CommunicationInfo::getRTCall() const { + return cast(PR.getStartPoint()); +} + +bool KMPC_CommunicationInfo::getAllCommunicatingParallelRegions( + SmallVectorImpl &CommunicatingParallelRegions) const { + + Function &ParallelFn = PR.getParallelSubFn(); + const ParallelIRRegionInfo &PRI = PR.getParallelRegionInfo(); + for (const auto &It : PRI) + for (ParallelRegion *PR : It.getSecond()) + if (&PR->getParallelCodeFunction() == &ParallelFn) + CommunicatingParallelRegions.push_back(PR); + + return true; +} + +unsigned KMPC_CommunicationInfo::getNumCommunicatedValues() const { + CallInst 
&CI = getRTCall(); + return CI.getNumArgOperands() - 3; +} + +Value *KMPC_CommunicationInfo::getCommunicatedValue(unsigned Idx) const { + CallInst &CI = getRTCall(); + return CI.getArgOperand(Idx + 3); +} + +ParallelIRCommunicationInfo::CommunicationKind +KMPC_CommunicationInfo::getCommunicationKind(unsigned Idx) const { + Value *CommunicatedValue = getCommunicatedValue(Idx); + if (!CommunicatedValue->getType()->isPointerTy()) + return CK_VALUE; + + Argument *CommunicatedValueArg = + cast(getCommunicatedValueInParallelRegion(Idx)); + if (!CommunicatedValueArg->hasNoCaptureAttr()) + return CK_VALUE; + + if (CommunicatedValueArg->hasAttribute(Attribute::ReadNone)) + return CK_VALUE; + + if (CommunicatedValueArg->hasAttribute(Attribute::WriteOnly)) + return CK_CONTAINER_OUT; + + if (CommunicatedValueArg->hasAttribute(Attribute::ReadOnly)) + return CK_CONTAINER_IN; + + return CK_CONTAINER_IN_OUT; +} + +void KMPC_CommunicationInfo::getCommunicatedValues( + SmallVectorImpl &CommunicatedValues) const { + CallInst &CI = getRTCall(); + for (unsigned u = 3, e = CI.getNumArgOperands(); u < e; u++) + CommunicatedValues.push_back(CI.getArgOperand(u)); +} + +Value *KMPC_CommunicationInfo::getCommunicatedValueInParallelRegion( + unsigned Idx) const { + return PR.getParallelSubFn().arg_begin() + 2 + Idx; +} + +bool KMPC_CommunicationInfo::hasAnnotatableCommunication() const { + return PR.getKind() == ParallelRegion::PRK_KMPC_FORK_RT; +} + +bool KMPC_CommunicationInfo::hasAttributeInParallelRegion(unsigned Idx, + Attribute::AttrKind Kind) const { + return cast(getCommunicatedValueInParallelRegion(Idx))->hasAttribute(Kind); +} Index: lib/Analysis/ParallelIR/RegionInfo.cpp =================================================================== --- /dev/null +++ lib/Analysis/ParallelIR/RegionInfo.cpp @@ -0,0 +1,174 @@ +//===- ParallelIRRegionInfo.cpp - Parallel region detection analysis +//--------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the 
University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the ParallelIR/RegionInfo analysis. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ParallelIR/RegionInfo.h" + +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/ADT/Statistic.h" + +using namespace llvm; + +#define DEBUG_TYPE "parallel-region-info" + +STATISTIC(NumParallelRegions, "The # of parallel regions"); + +//===----------------------------------------------------------------------===// +// ParallelRegion implementation +// + +ParallelRegion::~ParallelRegion() {} + +bool ParallelRegion::isPotentialBarrier(Instruction &I) const { + + if (!isa(I)) + return false; + + if (isa(I)) + return false; + + if (!I.mayHaveSideEffects() && !I.mayReadFromMemory()) + return false; + + return true; +} + +bool ParallelRegion::contains(const BasicBlock *BB, + const DominatorTree *DT) const { + bool Contains = false; + + // Fallback to a search of all blocks in this task. + BlockVisitorTy BBVisitor = [BB, &Contains](BasicBlock &CurrentBB, + bool Boundary) { + if (BB != &CurrentBB) + return true; + Contains = !Boundary; + return false; + }; + + visit(BBVisitor); + return Contains; +} + +bool ParallelRegion::contains(const Instruction *I, + const DominatorTree *DT) const { + bool Contains = false; + + // Fallback to a search of all blocks in this task. 
+ InstructionVisitorTy InstVisitor = [I, &Contains](Instruction &CurI) { + if (I != &CurI) + return true; + Contains = true; + return false; + }; + + visit(InstVisitor); + return Contains; +} + +void ParallelRegion::dump() const { return print(dbgs()); } + +//===----------------------------------------------------------------------===// +// ParallelIR/RegionInfo implementation +// + +void ParallelIRRegionInfo::addParallelRegion(ParallelRegion &PR) { + NumParallelRegions++; + ParallelRegionsMap[PR.getStartPoint().getFunction()].push_back(&PR); +} + +void ParallelIRRegionInfo::print(raw_ostream &OS) const { + for (auto &It : ParallelRegionsMap) { + assert(It.second.size()); + OS << "Parallel region in " << It.first->getName() << " [" + << It.second.size() << "]:\n"; + for (auto *PR : It.second) + PR->print(OS); + } +} + +void ParallelIRRegionInfo::dump() const { print(dbgs()); } + +void ParallelIRRegionInfo::releaseMemory() { + for (auto &It : ParallelRegionsMap) + DeleteContainerPointers(It.second); + ParallelRegionsMap.clear(); +} + +void findKMPCForkCalls(Module &, ParallelIRRegionInfo &); + +void ParallelIRRegionInfo::recalculate(Module &M) { + releaseMemory(); + + bool RecognizeKMPCFork = true; + + if (RecognizeKMPCFork) + findKMPCForkCalls(M, *this); + +} + +ParallelRegion *ParallelIRRegionInfo::getParallelRegionFor(Instruction *I) const { + for (ParallelRegion *PR : getParallelRegions(*I->getFunction())) + if (PR->contains(I)) + return PR; + return nullptr; +} + +//===----------------------------------------------------------------------===// +// ParallelRegionAnalysis implementation +// + +AnalysisKey ParallelIRRegionAnalysis::Key; + +ParallelIRRegionInfo +ParallelIRRegionAnalysis::run(Module &M, ModuleAnalysisManager &MAM) { + return ParallelIRRegionInfo(M); +} + +//===----------------------------------------------------------------------===// +// ParallelIRRegionInfoPass implementation +// + +bool ParallelIRRegionInfoPass::runOnModule(Module &M) { + 
PRI.recalculate(M); + return false; +} + +void ParallelIRRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +void ParallelIRRegionInfoPass::print(raw_ostream &OS, const Module *) const { + PRI.print(OS); +} + +void ParallelIRRegionInfoPass::verifyAnalysis() const { + // TODO Not implemented but merely a stub. +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void ParallelIRRegionInfoPass::dump() const { PRI.dump(); } +#endif + +char ParallelIRRegionInfoPass::ID = 0; + +INITIALIZE_PASS_BEGIN(ParallelIRRegionInfoPass, "pir-regions", + "Detect parallel regions", false, true) +INITIALIZE_PASS_END(ParallelIRRegionInfoPass, "pir-regions", + "Detect parallel regions", false, true) + +namespace llvm { +ModulePass *createParallelIRRegionInfoPass() { + return new ParallelIRRegionInfoPass(); +} +} // namespace llvm Index: lib/Passes/LLVMBuild.txt =================================================================== --- lib/Passes/LLVMBuild.txt +++ lib/Passes/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Passes parent = Libraries -required_libraries = AggressiveInstCombine Analysis CodeGen Core IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation +required_libraries = AggressiveInstCombine Analysis CodeGen Core IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation ParallelIR Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -50,6 +50,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" +#include "llvm/Analysis/ParallelIR/RegionInfo.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/UnreachableBlockElim.h" #include "llvm/IR/Dominators.h" @@ -137,6 +138,7 @@ #include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h" #include 
"llvm/Transforms/Scalar/SpeculativeExecution.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" +#include "llvm/Transforms/ParallelIR/AttributeAnnotator.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" #include "llvm/Transforms/Utils/BreakCriticalEdges.h" #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" Index: lib/Passes/PassRegistry.def =================================================================== --- lib/Passes/PassRegistry.def +++ lib/Passes/PassRegistry.def @@ -26,6 +26,7 @@ MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis()) MODULE_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) MODULE_ANALYSIS("verify", VerifierAnalysis()) +MODULE_ANALYSIS("pir-regions", ParallelIRRegionAnalysis()) #ifndef MODULE_ALIAS_ANALYSIS #define MODULE_ALIAS_ANALYSIS(NAME, CREATE_PASS) \ @@ -76,6 +77,7 @@ MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation()) MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass()) MODULE_PASS("verify", VerifierPass()) +MODULE_PASS("pir-attribute-annotator", ParallelIRAttributeAnnotatorPass()) #undef MODULE_PASS #ifndef CGSCC_ANALYSIS Index: lib/Transforms/CMakeLists.txt =================================================================== --- lib/Transforms/CMakeLists.txt +++ lib/Transforms/CMakeLists.txt @@ -8,3 +8,4 @@ add_subdirectory(Hello) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) +add_subdirectory(ParallelIR) Index: lib/Transforms/IPO/LLVMBuild.txt =================================================================== --- lib/Transforms/IPO/LLVMBuild.txt +++ lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ name = IPO parent = Transforms library_name = ipo -required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation +required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar 
Support TransformUtils Vectorize Instrumentation ParallelIR Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -36,6 +36,7 @@ #include "llvm/Transforms/IPO/InferFunctionAttrs.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/ParallelIR.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" @@ -469,6 +470,17 @@ // Infer attributes about declarations if possible. MPM.add(createInferFunctionAttrsLegacyPass()); + if (OptLevel > 2) { + // Add parallel optimizations to the pass pipeline. + // FIXME: This should only happen if the input contains + // parallel constructs as we also add canonicalization + // passes that might disturb the regular pipeline. + // TODO: We actually only need CG SCC passes but as we rely on + // function passes the old pass manager forces us to use + // module passes here. 
+ MPM.add(createParallelIRAttributeAnnotatorLegacyPass()); + } + addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); if (OptLevel > 2) Index: lib/Transforms/LLVMBuild.txt =================================================================== --- lib/Transforms/LLVMBuild.txt +++ lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC +subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC ParallelIR [component_0] type = Group Index: lib/Transforms/ParallelIR/AttributeAnnotator.cpp =================================================================== --- /dev/null +++ lib/Transforms/ParallelIR/AttributeAnnotator.cpp @@ -0,0 +1,388 @@ +//===- AttributeAnnotator.cpp -- Annotate attr. from/to parallel regions -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Attribute annotator for parallel regions. +// +// This pass tries to add attributes to the instructions representing parallel +// regions but also to the parallel regions itself, e.g., their arguments if +// applicable. +// +// TODO: This should actually be a SCC pass on the call graph. However, the old +// pass manager doesn't allow us to use the function analyses the way we +// want/need to so it is a module pass for now. 
+//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/ParallelIR/AttributeAnnotator.h" + +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/ParallelIR/RegionInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/ParallelIR.h" +#include "llvm/Transforms/ParallelIR/Builder.h" + +#define DEBUG_TYPE "pir-attribute-annotator" + +using namespace llvm; + +STATISTIC(NumNoAliasArguments, "Number of no-alias parallel region arguments"); +STATISTIC(NumNoCaptureParameters, + "Number of no-capture parallel region parameters"); +STATISTIC(NumReadNoneParameters, + "Number of read-none parallel region parameters"); +STATISTIC(NumReadOnlyParameters, + "Number of read-only parallel region parameters"); +STATISTIC(NumWriteOnlyParameters, + "Number of write-only parallel region parameters"); + +static cl::opt AnnotateAttributes("pir-annotate-attributes", + cl::desc("Annotate attributes"), + cl::Hidden, cl::init(true), + cl::ZeroOrMore); + +namespace { + +// FIXME: Helper type necessary as long as the parallel IR passes are +// implemented as module not SCC passes on the CG. +template +using FuncResultProviderTy = std::function; + +/// Transformer that identifies merge opportunities and applies code changes. +struct AttributeAnnotator { + + /// Constructor that accepts the parallel region info and analyses providers. + /// + /// @param PRI The parallel region info for this module. + /// @param AAProvider Callback to get alias information for a function. 
+ AttributeAnnotator(ParallelIRRegionInfo &PRI, + FuncResultProviderTy &AAProvider, + FuncResultProviderTy &DTProvider) + : PRI(PRI), AAProvider(AAProvider), DTProvider(DTProvider) {} + + /// Run the attribute Annotator pass on the parallel regions in @p M. + /// + /// @param M The module to run on. + /// + /// @returns True, if any change was made, false otherwise. + bool runOnModule(Module &M); + +private: + /// Parameter attributes that we try to move from inside the parallel region + /// to the outside. + Attribute::AttrKind const ParameterAttributes[4] = { + Attribute::NoCapture, Attribute::ReadNone, Attribute::ReadOnly, + Attribute::WriteOnly}; + + std::string const ParameterAttributeNames[4] = {"no-capture", "read-none", + "read-only", "write-only"}; + + /// Statistics to keep track of annotated parameters (see above). + Statistic *const ParameterAttributeStatistics[4] = { + &NumNoCaptureParameters, &NumReadNoneParameters, &NumReadOnlyParameters, + &NumWriteOnlyParameters}; + + /// Try to annotate arguments with no-alias, nocapture, etc. attributes. + bool annotateArgumentAttributes(ParallelRegion &PR); + + /// Annotate parallel region instructions with domain knowledge. + bool annotateParallelRepresentation(ParallelRegion &PR); + + /// The parallel region info for the module. + ParallelIRRegionInfo &PRI; + + /// Function analyses provider callbacks. + /// + ///{ + FuncResultProviderTy &AAProvider; + FuncResultProviderTy &DTProvider; + ///} +}; + +bool AttributeAnnotator::annotateParallelRepresentation(ParallelRegion &PR) { + switch (PR.getKind()) { + case ParallelRegion::PRK_KMPC_RT: { + // We know the parallel runtime library call does not throw an exception. 
+    assert(isa<CallInst>(PR.getStartPoint()) &&
+           "Expected runtime call for KMPC parallel region");
+    // BUG FIX: doesNotThrow() is a const query and had no effect here; the
+    // intent is to *set* the attribute. Only report a change if the
+    // attribute was not already present.
+    CallInst &RTCall = cast<CallInst>(PR.getStartPoint());
+    if (RTCall.doesNotThrow())
+      return false;
+    RTCall.setDoesNotThrow();
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+bool AttributeAnnotator::runOnModule(Module &M) {
+  if (!AnnotateAttributes)
+    return false;
+
+  // Visit every detected parallel region of every function in the module.
+  bool Changed = false;
+  for (auto &It : PRI) {
+    for (ParallelRegion *PR : It.second) {
+      Changed |= annotateParallelRepresentation(*PR);
+      Changed |= annotateArgumentAttributes(*PR);
+    }
+  }
+
+  return Changed;
+}
+
+bool AttributeAnnotator::annotateArgumentAttributes(ParallelRegion &PR) {
+  // Try to set no-alias, no-capture, etc. argument annotations if possible.
+
+  LLVM_DEBUG(dbgs() << "Try to annotate argument attributes to "
+                    << PR.getStartPoint() << "\n");
+
+  const ParallelIRCommunicationInfo &PRCI = PR.getCommunicationInfo();
+  if (!PRCI.hasAnnotatableCommunication()) {
+    LLVM_DEBUG(dbgs() << " - Parallel region kind " << PR.getKind()
+                      << " does not support communication attributes, skip!\n");
+    return false;
+  }
+
+  SmallVector<ParallelRegion *, 4> CommunicatingParallelRegions;
+  if (!PRCI.getAllCommunicatingParallelRegions(CommunicatingParallelRegions)) {
+    LLVM_DEBUG(
+        dbgs()
+        << " - Communication involves an unknown user, skip for now!\n");
+    return false;
+  }
+
+  // Unrolling and inlining might have duplicated the (indirect) call sites of
+  // a outlined parallel region. These situations are not supported yet as they
+  // would require us to intersect the information from all call sites.
+ if (CommunicatingParallelRegions.size() > 1) { + LLVM_DEBUG(dbgs() << " - Communication involves " + << CommunicatingParallelRegions.size() + << " parallel regions, skip for now!\n"); + return false; + } + assert(!CommunicatingParallelRegions.empty()); + assert(CommunicatingParallelRegions.front() == &PR); + + SmallVector CommunicatedValues; + PRCI.getCommunicatedValues(CommunicatedValues); + + Instruction &PRStartInst = PR.getStartPoint(); + DominatorTree &DT = DTProvider(*PRStartInst.getFunction()); + + auto *PIRBuilder = ParallelIRBuilder::Create(PRI, PR.getKind()); + + // Set of pointer parameters that might be alias free in the parallel region, + // thus no-alias arguments. + SmallPtrSet NoAliasCandidates; + + bool Changed = false; + unsigned NumCommunicatedValues = CommunicatedValues.size(); + for (unsigned Idx = 0; Idx < NumCommunicatedValues; Idx++) { + if (!CommunicatedValues[Idx]) + continue; + + if (isa(CommunicatedValues[Idx])) + continue; + + // For now we skip all non-pointer-type values. + if (!CommunicatedValues[Idx]->getType()->isPointerTy()) + continue; + + int NumAttributes = + sizeof(ParameterAttributes) / sizeof(ParameterAttributes[0]); + + // This is for bookkeeping purposes only. + int NumAttributeStatistics = sizeof(ParameterAttributeStatistics) / + sizeof(ParameterAttributeStatistics[0]); + assert(NumAttributes <= NumAttributeStatistics && + "Require at least as many attribute statistics as there are " + "attributes!\n"); + + // Propagate the known argument/parameter attributes. 
+ for (int i = 0; i < NumAttributes; i++) { + if (!PRCI.hasAttributeInParallelRegion(Idx, ParameterAttributes[i])) + continue; + + if (!PIRBuilder->addAttributeInSequentialRegion(PRCI, Idx, + ParameterAttributes[i])) + continue; + + LLVM_DEBUG({ + int NumAttributeNames = sizeof(ParameterAttributeNames) / + sizeof(ParameterAttributeNames[0]); + assert(NumAttributes == NumAttributeNames); + dbgs() << " - Argument " << Idx << " is tagged with " + << ParameterAttributeNames[i] << "\n"; + }); + + (*ParameterAttributeStatistics[i])++; + Changed = true; + } + + // After we propagated "local" parameter attributes we proceed to check + // if this argument could be marked as no-alias. If it already is marked, or + // if it might alias with anything, we skip it. To check for the latter, we + // first require the argument to be identified as function local and not + // captured up to the point of the parallel region. Later we also verify + // they do not alias with other arguments to the parallel region. + if (PRCI.hasAttributeInParallelRegion(Idx, Attribute::NoAlias)) + continue; + if (!isIdentifiedFunctionLocal(CommunicatedValues[Idx])) + continue; + + if (PointerMayBeCapturedBefore(CommunicatedValues[Idx], false, true, + &PRStartInst, &DT)) + continue; + + NoAliasCandidates.insert(CommunicatedValues[Idx]); + } + + if (NoAliasCandidates.empty()) { + LLVM_DEBUG(dbgs() << " - Parallel region has no no-alias candidates.\n"); + return Changed; + } + + SmallVector PotentialBarriers; + PR.getPotentialBarriers(PotentialBarriers); + + // This initial version does not support potential (and actual) barriers as + // the no-alias attributes would interfere with them. A way to combine + // no-alias attributes and potential/actual barriers is the use of operand + // bundles. 
+ if (!PotentialBarriers.empty()) { + LLVM_DEBUG(dbgs() << " - Parallel region contains " + << PotentialBarriers.size() + << " potential barriers, skip for now!\n"); + return Changed; + } + + // While all no-alias candidates do not alias with globals or pointers loaded + // from memory they might alias with other arguments. To check this we put + // them all in an alias set tracker and filter out singleton alias sets. + AliasAnalysis &AA = AAProvider(*PRStartInst.getFunction()); + AliasSetTracker AST(AA); + AAMDNodes AATags; + + for (Value *CommunicatedValue : CommunicatedValues) { + if (!CommunicatedValue) + continue; + AST.add(CommunicatedValue, MemoryLocation::UnknownSize, AATags); + } + + for (unsigned Idx = 0; Idx < NumCommunicatedValues; Idx++) { + if (!NoAliasCandidates.count(CommunicatedValues[Idx])) + continue; + + assert(CommunicatedValues[Idx]); + assert(CommunicatedValues[Idx]->getType()->isPointerTy()); + const auto &AliasSet = AST.getAliasSetForPointer( + CommunicatedValues[Idx], MemoryLocation::UnknownSize, AATags); + + // Check for singleton alias sets, thus pointers that do not alias. 
+ if (++AliasSet.begin() != AliasSet.end()) + continue; + assert(AliasSet.isMustAlias()); + + if (!PIRBuilder->addAttribute(PRCI, Idx, Attribute::NoAlias)) + continue; + + LLVM_DEBUG(dbgs() << " - Argument " << Idx << " is tagged as no-alias\n"); + + Changed = true; + NumNoAliasArguments++; + } + + return Changed; +} + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// Pass Manager integration code +// +//===----------------------------------------------------------------------===// +PreservedAnalyses +ParallelIRAttributeAnnotatorPass::run(Module &M, ModuleAnalysisManager &MAM) { + auto &FAM = MAM.getResult(M).getManager(); + FuncResultProviderTy AAProvider = + [&](Function &F) -> AliasAnalysis & { + return FAM.getResult(F); + }; + FuncResultProviderTy DTProvider = + [&](Function &F) -> DominatorTree & { + return FAM.getResult(F); + }; + + ParallelIRRegionInfo &PRI = MAM.getResult(M); + AttributeAnnotator PRM(PRI, AAProvider, DTProvider); + if (PRM.runOnModule(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { + +struct ParallelIRAttributeAnnotatorLegacyPass : public ModulePass { + static char ID; + + ParallelIRAttributeAnnotatorLegacyPass() : ModulePass(ID) { + initializeParallelIRAttributeAnnotatorLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + + FuncResultProviderTy DTProvider = + [&](Function &F) -> DominatorTree & { + return getAnalysis(F).getDomTree(); + }; + FuncResultProviderTy AAProvider = + [&](Function &F) -> AliasAnalysis & { + return getAnalysis(F).getAAResults(); + }; + + ParallelIRRegionInfo &PRI = + getAnalysis().getParallelIRRegionInfo(); + AttributeAnnotator RM(PRI, AAProvider, DTProvider); + + return RM.runOnModule(M); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + 
AU.addPreserved(); + AU.setPreservesCFG(); + } +}; + +} // end anonymous namespace + +char ParallelIRAttributeAnnotatorLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(ParallelIRAttributeAnnotatorLegacyPass, + "pir-attribute-annotator", + "Annotate attributes to parallel region", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ParallelIRRegionInfoPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(ParallelIRAttributeAnnotatorLegacyPass, + "pir-attribute-annotator", + "Annotate attributes to parallel region", false, false) + +ModulePass *llvm::createParallelIRAttributeAnnotatorLegacyPass() { + return new ParallelIRAttributeAnnotatorLegacyPass(); +} Index: lib/Transforms/ParallelIR/Builder.cpp =================================================================== --- /dev/null +++ lib/Transforms/ParallelIR/Builder.cpp @@ -0,0 +1,42 @@ +//===- ParallelIR/Builder.cpp - Parallel region IR builder ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the general (and abstract) parallel IR builder interface. +// The interface allows to manipulate parallel regions regardless of the +// underlying representation by representation specific implementations in the +// subclasses. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/ParallelIR/Builder.h" + +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "pir-builder" + +using namespace llvm; + +ParallelIRBuilder *createKMPCIRBuilder(ParallelIRRegionInfo &, + ParallelRegion::ParallelRegionKind PRKind); + +ParallelIRBuilder * +ParallelIRBuilder::Create(ParallelIRRegionInfo &PRI, + ParallelRegion::ParallelRegionKind PRKind) { + + switch (PRKind) { + case ParallelRegion::PRK_KMPC_RT: + case ParallelRegion::PRK_KMPC_TASK_RT: + case ParallelRegion::PRK_KMPC_FORK_RT: + return createKMPCIRBuilder(PRI, PRKind); + default: + break; + } + + llvm_unreachable("No builder for chosen parallel region kind available!"); +} Index: lib/Transforms/ParallelIR/CMakeLists.txt =================================================================== --- /dev/null +++ lib/Transforms/ParallelIR/CMakeLists.txt @@ -0,0 +1,13 @@ +add_llvm_library(LLVMParallelIROpts + ParallelIR.cpp + Builder.cpp + AttributeAnnotator.cpp + KMPCImpl.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/ParallelIR + + DEPENDS + intrinsics_gen + ) Index: lib/Transforms/ParallelIR/KMPCImpl.cpp =================================================================== --- /dev/null +++ lib/Transforms/ParallelIR/KMPCImpl.cpp @@ -0,0 +1,123 @@ +//===- KMPCImpl.cpp - Parallel IR Transformation Implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the parallel IR tranformation interface for parallel +// regions represented with KMPC (OpenMP) runtime calls. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/ParallelIR/Builder.h" + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ParallelIR/KMPCImpl.h" +#include "llvm/Analysis/ParallelIR/RegionInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "pir-builder" + +//===----------------------------------------------------------------------===// +// PIRBuilder specialization for the OpenMP KMPC runtime library. +// + +struct KMPC_ParallelIRBuilder : public ParallelIRBuilder { + + KMPC_ParallelIRBuilder(ParallelIRRegionInfo &PRI, + ParallelRegion::ParallelRegionKind PRKind) + : PRI(PRI), PRKind(PRKind) { + assert(PRKind == ParallelRegion::PRK_KMPC_FORK_RT || + PRKind == ParallelRegion::PRK_KMPC_TASK_RT); + } + + /// Return the offset at which the first subfunction call argument is located. + unsigned getFirstArgumentOffset() const; + + /// Return the offset at which the first subfunction parameter is located. + unsigned getFirstParameterOffset() const; + + /// See @ParallelIRBuilder::addAttributeInSequentialRegion + virtual bool + addAttributeInSequentialRegion(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, + Attribute::AttrKind Kind) const override; + + /// See @ParallelIRBuilder::addAttributeInParallelRegion + virtual bool + addAttributeInParallelRegion(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, + Attribute::AttrKind Kind) const override; + + /// See @ParallelIRBuilder::addAttribute + virtual bool addAttribute(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, + Attribute::AttrKind Kind) const override; + + /// The parallel region info pass. + ParallelIRRegionInfo &PRI; + + /// The actual parallel region info kind this builder was created for. 
+ /// There are multiple KMPC encodings (tasks/forks) that can be distinguished + /// this way. + ParallelRegion::ParallelRegionKind PRKind; +}; + +//===----------------------------------------------------------------------===// +// KMPC_ParallelIRBuilder implementation + +ParallelIRBuilder * +createKMPCIRBuilder(ParallelIRRegionInfo &PRI, + ParallelRegion::ParallelRegionKind PRKind) { + return new KMPC_ParallelIRBuilder(PRI, PRKind); +} + +unsigned KMPC_ParallelIRBuilder::getFirstParameterOffset() const { + assert(PRKind == ParallelRegion::PRK_KMPC_FORK_RT || + PRKind == ParallelRegion::PRK_KMPC_TASK_RT); + return PRKind == ParallelRegion::PRK_KMPC_FORK_RT ? 2 : 1; +} + +unsigned KMPC_ParallelIRBuilder::getFirstArgumentOffset() const { + assert(PRKind == ParallelRegion::PRK_KMPC_FORK_RT || + PRKind == ParallelRegion::PRK_KMPC_TASK_RT); + return PRKind == ParallelRegion::PRK_KMPC_FORK_RT ? 3 : 2; +} + +bool KMPC_ParallelIRBuilder::addAttributeInSequentialRegion( + const ParallelIRCommunicationInfo &PRCI, unsigned Idx, + Attribute::AttrKind Kind) const { + + const KMPC_CommunicationInfo &KMPCCI = + static_cast(PRCI); + CallInst &RTCall = KMPCCI.getRTCall(); + RTCall.addParamAttr(Idx + getFirstArgumentOffset(), Kind); + return true; +} + +bool KMPC_ParallelIRBuilder::addAttributeInParallelRegion( + const ParallelIRCommunicationInfo &PRCI, unsigned Idx, + Attribute::AttrKind Kind) const { + Argument *Arg = + cast(PRCI.getCommunicatedValueInParallelRegion(Idx)); + Arg->addAttr(Kind); + return true; +} + +bool KMPC_ParallelIRBuilder::addAttribute( + const ParallelIRCommunicationInfo &PRCI, unsigned Idx, + Attribute::AttrKind Kind) const { + addAttributeInSequentialRegion(PRCI, Idx, Kind); + addAttributeInParallelRegion(PRCI, Idx, Kind); + return true; +} Index: lib/Transforms/ParallelIR/LLVMBuild.txt =================================================================== --- lib/Transforms/ParallelIR/LLVMBuild.txt +++ lib/Transforms/ParallelIR/LLVMBuild.txt @@ -1,4 +1,4 @@ 
-;===- ./lib/Passes/LLVMBuild.txt -------------------------------*- Conf -*--===; +;===- ./lib/Transforms/ParallelIR/LLVMBuild.txt ----------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,6 +17,7 @@ [component_0] type = Library -name = Passes -parent = Libraries -required_libraries = AggressiveInstCombine Analysis CodeGen Core IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation +name = ParallelIR +parent = Transforms +library_name = ParallelIROpts +required_libraries = Analysis Core InstCombine Support TransformUtils Index: lib/Transforms/ParallelIR/ParallelIR.cpp =================================================================== --- /dev/null +++ lib/Transforms/ParallelIR/ParallelIR.cpp @@ -0,0 +1,22 @@ +//===-- ParallelIR.cpp ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/ParallelIR.h" + +#include "llvm/InitializePasses.h" + +using namespace llvm; + +/// initializeParallelIROptsPasses - Initialize all passes linked into the +/// ParallelIROpts library. 
+void llvm::initializeParallelIROpts(PassRegistry &Registry) { + initializeParallelIRAttributeAnnotatorLegacyPassPass(Registry); +} Index: test/Other/opt-O3-pipeline.ll =================================================================== --- test/Other/opt-O3-pipeline.ll +++ test/Other/opt-O3-pipeline.ll @@ -27,6 +27,9 @@ ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Force set function attributes ; CHECK-NEXT: Infer set function attributes +; CHECK-NEXT: Detect parallel regions +; CHECK-NEXT: Annotate attributes to parallel region +; CHECK-NEXT: Unnamed pass: implement Pass::getPassName() ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Call-site splitting ; CHECK-NEXT: Interprocedural Sparse Conditional Constant Propagation Index: test/Transforms/ParallelIR/kmpc_arg_attributes.ll =================================================================== --- /dev/null +++ test/Transforms/ParallelIR/kmpc_arg_attributes.ll @@ -0,0 +1,139 @@ +; RUN: opt -analyze -pir-regions %s | FileCheck %s --check-prefix=PIR_REGS +; RUN: opt -S -pir-attribute-annotator %s | FileCheck %s --check-prefix=PIR_ATTR + +; PIR_REGS: Parallel region in main [1]: +; PIR_REGS-NEXT: Parallel Region [5]: +; PIR_REGS-NEXT: fork call: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call( +; PIR_REGS-NEXT: sub-function: .omp_outlined. +; PIR_REGS-NEXT: communicated: %c = alloca [100 x float], align 16 : 3 +; PIR_REGS-NEXT: communicated: %a = alloca [100 x float], align 16 : 1 +; PIR_REGS-NEXT: communicated: %b = alloca [100 x float], align 16 : 1 + +; The PIR ATTR check lines below will verify that %a, %b, and %c are annotated +; with noalias, nocapture, and for the first two also readonly. %c should be +; writeonly but it has not the appropriate argument attribute. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%ident_t = type { i32, i32, i32, i32, i8* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.str.1 = private unnamed_addr constant [13 x i8] c"c[N/2] = %f\0A\00", align 1 + +define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 { +entry: + %a = alloca [100 x float], align 16 + %b = alloca [100 x float], align 16 + %c = alloca [100 x float], align 16 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %conv = sitofp i32 %i.0 to double + %mul = fmul double %conv, 1.000000e+00 + %conv1 = fptrunc double %mul to float + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds [100 x float], [100 x float]* %b, i64 0, i64 %idxprom + store float %conv1, float* %arrayidx, align 4 + %mul2 = fmul float 2.000000e+00, %conv1 + %idxprom3 = sext i32 %i.0 to i64 + %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %a, i64 0, i64 %idxprom3 + store float %mul2, float* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond +; PIR_ATTR: [100 x float]* noalias nocapture %c, [100 x float]* noalias nocapture readonly %a, [100 x float]* noalias nocapture readonly %b) + call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%ident_t* @1, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [100 x float]*, [100 x float]*, [100 x float]*)* @.omp_outlined. to void (i32*, i32*, ...)*), [100 x float]* %c, [100 x float]* %a, [100 x float]* %b) + %arrayidx5 = getelementptr inbounds [100 x float], [100 x float]* %c, i64 0, i64 50 + %0 = load float, float* %arrayidx5, align 8 + %conv6 = fpext float %0 to double + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), double %conv6) + ret i32 0 +} + +; PIR_ATTR: [100 x float]* noalias nocapture dereferenceable(400) %c, [100 x float]* noalias nocapture readonly dereferenceable(400) %a, [100 x float]* noalias nocapture readonly dereferenceable(400) %b) #0 { +define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., [100 x float]* nocapture dereferenceable(400) %c, [100 x float]* nocapture readonly dereferenceable(400) %a, [100 x float]* nocapture readonly dereferenceable(400) %b) #0 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + store i32 0, i32* %.omp.lb, align 4 + store i32 99, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32, i32* %.global_tid., align 4 + call void @__kmpc_for_static_init_4(%ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %1 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %1, 99 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 99, %cond.true ], [ %2, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %3 = load 
i32, i32* %.omp.lb, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %.omp.iv.0 = phi i32 [ %3, %cond.end ], [ %add7, %omp.inner.for.inc ] + %4 = load i32, i32* %.omp.ub, align 4 + %cmp1 = icmp sle i32 %.omp.iv.0, %4 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %mul = mul nsw i32 %.omp.iv.0, 1 + %add = add nsw i32 0, %mul + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds [100 x float], [100 x float]* %a, i64 0, i64 %idxprom + %5 = load float, float* %arrayidx, align 4 + %idxprom2 = sext i32 %add to i64 + %arrayidx3 = getelementptr inbounds [100 x float], [100 x float]* %b, i64 0, i64 %idxprom2 + %6 = load float, float* %arrayidx3, align 4 + %add4 = fadd float %5, %6 + %idxprom5 = sext i32 %add to i64 + %arrayidx6 = getelementptr inbounds [100 x float], [100 x float]* %c, i64 0, i64 %idxprom5 + store float %add4, float* %arrayidx6, align 4 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %add7 = add nsw i32 %.omp.iv.0, 1 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%ident_t* @0, i32 %0) + ret void +} + +declare void @__kmpc_for_static_init_4(%ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%ident_t*, i32) + +declare void @__kmpc_fork_call(%ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare i32 @printf(i8*, ...) 
#1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/ParallelIR/kmpc_arg_attributes2.ll =================================================================== --- /dev/null +++ test/Transforms/ParallelIR/kmpc_arg_attributes2.ll @@ -0,0 +1,243 @@ +; RUN: opt -analyze -pir-regions %s | FileCheck %s --check-prefix=PIR_REGS +; RUN: opt -S -pir-attribute-annotator %s | FileCheck %s --check-prefix=PIR_ATTR +; +; PIR_REGS: Parallel region in main [1]: +; PIR_REGS-NEXT: Parallel Region [5]: +; PIR_REGS-NEXT: fork call: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call( +; PIR_REGS-NEXT: sub-function: .omp_outlined. +; PIR_REGS-NEXT: communicated: %c = alloca [10 x float], align 16 : 3 +; PIR_REGS-NEXT: communicated: %a = alloca [10 x float], align 16 : 1 +; PIR_REGS-NEXT: communicated: %b = alloca [10 x float], align 16 : 3 + +; The PIR ATTR check lines below will verify that %a, %b, and %c are annotated +; with nocapture, and %a with readonly. %c should be writeonly but it has not +; the appropriate argument attribute. 
+; Note: noalias is missing due to the potential barriers. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%ident_t = type { i32, i32, i32, i32, i8* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.str.1 = private unnamed_addr constant [13 x i8] c"c[N/2] = %f\0A\00", align 1 + +define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 { +entry: + %a = alloca [10 x float], align 16 + %b = alloca [10 x float], align 16 + %c = alloca [10 x float], align 16 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %conv = sitofp i32 %i.0 to double + %mul = fmul double %conv, 1.000000e+00 + %conv1 = fptrunc double %mul to float + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds [10 x float], [10 x float]* %b, i64 0, i64 %idxprom + store float %conv1, float* %arrayidx, align 4 + %mul2 = fmul float 2.000000e+00, %conv1 + %idxprom3 = sext i32 %i.0 to i64 + %arrayidx4 = getelementptr inbounds [10 x float], [10 x float]* %a, i64 0, i64 %idxprom3 + store float %mul2, float* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + br label %for.cond5 + +for.cond5: ; preds = %for.inc11, %for.end + %i.1 = phi i32 [ 0, %for.end ], [ %inc12, %for.inc11 ] + %cmp6 = icmp 
slt i32 %i.1, 10 + br i1 %cmp6, label %for.body8, label %for.end13 + +for.body8: ; preds = %for.cond5 + %idxprom9 = sext i32 %i.1 to i64 + %arrayidx10 = getelementptr inbounds [10 x float], [10 x float]* %c, i64 0, i64 %idxprom9 + store float 0.000000e+00, float* %arrayidx10, align 4 + br label %for.inc11 + +for.inc11: ; preds = %for.body8 + %inc12 = add nsw i32 %i.1, 1 + br label %for.cond5 + +for.end13: ; preds = %for.cond5 + br label %for.cond14 + +for.cond14: ; preds = %for.inc18, %for.end13 + %i.2 = phi i32 [ 0, %for.end13 ], [ %inc19, %for.inc18 ] + %cmp15 = icmp slt i32 %i.2, 10 + br i1 %cmp15, label %for.body17, label %for.end20 + +for.body17: ; preds = %for.cond14 +; PIR_ATTR: [10 x float]* nocapture %c, [10 x float]* nocapture readonly %a, [10 x float]* nocapture %b) + call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [10 x float]*, [10 x float]*, [10 x float]*)* @.omp_outlined. to void (i32*, i32*, ...)*), [10 x float]* %c, [10 x float]* %a, [10 x float]* %b) + br label %for.inc18 + +for.inc18: ; preds = %for.body17 + %inc19 = add nsw i32 %i.2, 1 + br label %for.cond14 + +for.end20: ; preds = %for.cond14 + %arrayidx21 = getelementptr inbounds [10 x float], [10 x float]* %c, i64 0, i64 5 + %0 = load float, float* %arrayidx21, align 4 + %conv22 = fpext float %0 to double + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), double %conv22) + ret i32 0 +} + +; Note: noalias attributes are not placed due to the (potential) barriers inside the function! 
+; +; PIR_ATTR: [10 x float]* nocapture dereferenceable(40) %c, [10 x float]* nocapture readonly dereferenceable(40) %a, [10 x float]* nocapture dereferenceable(40) %b) #0 { +define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., [10 x float]* nocapture dereferenceable(40) %c, [10 x float]* nocapture readonly dereferenceable(40) %a, [10 x float]* nocapture dereferenceable(40) %b) #0 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %.omp.lb12 = alloca i32, align 4 + %.omp.ub13 = alloca i32, align 4 + %.omp.stride14 = alloca i32, align 4 + %.omp.is_last15 = alloca i32, align 4 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32, i32* %.global_tid., align 4 + call void @__kmpc_for_static_init_4(%ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %1 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %1, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %2, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %3 = load i32, i32* %.omp.lb, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %.omp.iv.0 = phi i32 [ %3, %cond.end ], [ %add9, %omp.inner.for.inc ] + %4 = load i32, i32* %.omp.ub, align 4 + %cmp2 = icmp sle i32 %.omp.iv.0, %4 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %mul = mul nsw i32 %.omp.iv.0, 1 + %add = add nsw i32 0, %mul + 
%idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds [10 x float], [10 x float]* %a, i64 0, i64 %idxprom + %5 = load float, float* %arrayidx, align 4 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [10 x float], [10 x float]* %c, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %5, %6 + %idxprom6 = sext i32 %add to i64 + %arrayidx7 = getelementptr inbounds [10 x float], [10 x float]* %c, i64 0, i64 %idxprom6 + %7 = load float, float* %arrayidx7, align 4 + %add8 = fadd float %7, %add5 + store float %add8, float* %arrayidx7, align 4 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %add9 = add nsw i32 %.omp.iv.0, 1 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%ident_t* @0, i32 %0) + call void @__kmpc_barrier(%ident_t* @1, i32 %0) + store i32 0, i32* %.omp.lb12, align 4 + store i32 9, i32* %.omp.ub13, align 4 + store i32 1, i32* %.omp.stride14, align 4 + store i32 0, i32* %.omp.is_last15, align 4 + call void @__kmpc_for_static_init_4(%ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last15, i32* %.omp.lb12, i32* %.omp.ub13, i32* %.omp.stride14, i32 1, i32 1) + %8 = load i32, i32* %.omp.ub13, align 4 + %cmp17 = icmp sgt i32 %8, 9 + br i1 %cmp17, label %cond.true18, label %cond.false19 + +cond.true18: ; preds = %omp.loop.exit + br label %cond.end20 + +cond.false19: ; preds = %omp.loop.exit + %9 = load i32, i32* %.omp.ub13, align 4 + br label %cond.end20 + +cond.end20: ; preds = %cond.false19, %cond.true18 + %cond21 = phi i32 [ 9, %cond.true18 ], [ %9, %cond.false19 ] + store i32 %cond21, i32* %.omp.ub13, align 4 + %10 = load i32, i32* %.omp.lb12, align 4 + br label %omp.inner.for.cond22 + +omp.inner.for.cond22: ; preds = 
%omp.inner.for.inc36, %cond.end20 + %.omp.iv10.0 = phi i32 [ %10, %cond.end20 ], [ %add37, %omp.inner.for.inc36 ] + %11 = load i32, i32* %.omp.ub13, align 4 + %cmp23 = icmp sle i32 %.omp.iv10.0, %11 + br i1 %cmp23, label %omp.inner.for.body24, label %omp.inner.for.end38 + +omp.inner.for.body24: ; preds = %omp.inner.for.cond22 + %mul25 = mul nsw i32 %.omp.iv10.0, 1 + %add26 = add nsw i32 0, %mul25 + %idxprom27 = sext i32 %add26 to i64 + %arrayidx28 = getelementptr inbounds [10 x float], [10 x float]* %a, i64 0, i64 %idxprom27 + %12 = load float, float* %arrayidx28, align 4 + %idxprom29 = sext i32 %add26 to i64 + %arrayidx30 = getelementptr inbounds [10 x float], [10 x float]* %b, i64 0, i64 %idxprom29 + %13 = load float, float* %arrayidx30, align 4 + %add31 = fadd float %12, %13 + %idxprom32 = sext i32 %add26 to i64 + %arrayidx33 = getelementptr inbounds [10 x float], [10 x float]* %b, i64 0, i64 %idxprom32 + %14 = load float, float* %arrayidx33, align 4 + %add34 = fadd float %14, %add31 + store float %add34, float* %arrayidx33, align 4 + br label %omp.body.continue35 + +omp.body.continue35: ; preds = %omp.inner.for.body24 + br label %omp.inner.for.inc36 + +omp.inner.for.inc36: ; preds = %omp.body.continue35 + %add37 = add nsw i32 %.omp.iv10.0, 1 + br label %omp.inner.for.cond22 + +omp.inner.for.end38: ; preds = %omp.inner.for.cond22 + br label %omp.loop.exit39 + +omp.loop.exit39: ; preds = %omp.inner.for.end38 + call void @__kmpc_for_static_fini(%ident_t* @0, i32 %0) + call void @__kmpc_barrier(%ident_t* @1, i32 %0) + ret void +} + +declare void @__kmpc_for_static_init_4(%ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%ident_t*, i32) + +declare void @__kmpc_barrier(%ident_t*, i32) + +declare void @__kmpc_fork_call(%ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare i32 @printf(i8*, ...) 
#1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/ParallelIR/kmpc_noalias_arg.ll =================================================================== --- /dev/null +++ test/Transforms/ParallelIR/kmpc_noalias_arg.ll @@ -0,0 +1,145 @@ +; RUN: opt -analyze -pir-regions %s | FileCheck %s --check-prefix=PIR_REGS +; RUN: opt -S -pir-attribute-annotator %s | FileCheck %s --check-prefix=PIR_ATTR + +; PIR_REGS: Parallel region in main [1]: +; PIR_REGS-NEXT: Parallel Region [5]: +; PIR_REGS-NEXT: fork call: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call( +; PIR_REGS-NEXT: sub-function: .omp_outlined. +; PIR_REGS-NEXT: communicated: %c = alloca [100 x float], align 16 : 3 +; PIR_REGS-NEXT: communicated: %a = alloca [100 x float], align 16 : 1 +; PIR_REGS-NEXT: communicated: %b = alloca [100 x float], align 16 : 1 + +; The PIR ATTR check lines below will verify that %c is annotated as noalias but +; not %a and %b as they can escape prior to the parallel region. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%ident_t = type { i32, i32, i32, i32, i8* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.str.1 = private unnamed_addr constant [13 x i8] c"c[N/2] = %f\0A\00", align 1 + +@Capture = common global float* null, align 8 +declare void @capture([100 x float] *) + +define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 { +entry: + %a = alloca [100 x float], align 16 + %b = alloca [100 x float], align 16 + %c = alloca [100 x float], align 16 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %acast = bitcast [100 x float]* %a to float* + store float* %acast, float ** @Capture + %conv = sitofp i32 %i.0 to double + %mul = fmul double %conv, 1.000000e+00 + %conv1 = fptrunc double %mul to float + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds [100 x float], [100 x float]* %b, i64 0, i64 %idxprom + store float %conv1, float* %arrayidx, align 4 + %mul2 = fmul float 2.000000e+00, %conv1 + %idxprom3 = sext i32 %i.0 to i64 + %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %a, i64 0, i64 %idxprom3 + store float %mul2, float* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + call void @capture([100 x float]* %b) +; PIR_ATTR: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%ident_t* @1, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [100 x float]*, [100 x float]*, [100 x float]*)* @.omp_outlined. to void (i32*, i32*, ...)*), [100 x float]* noalias nocapture %c, [100 x float]* nocapture readonly %a, [100 x float]* nocapture readonly %b) + call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* @1, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [100 x float]*, [100 x float]*, [100 x float]*)* @.omp_outlined. to void (i32*, i32*, ...)*), [100 x float]* %c, [100 x float]* %a, [100 x float]* %b) + call void @capture([100 x float]* %c) + %arrayidx5 = getelementptr inbounds [100 x float], [100 x float]* %c, i64 0, i64 50 + %0 = load float, float* %arrayidx5, align 8 + %conv6 = fpext float %0 to double + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), double %conv6) + ret i32 0 +} + +; PIR_ATTR: [100 x float]* noalias nocapture dereferenceable(400) %c, [100 x float]* nocapture readonly dereferenceable(400) %a, [100 x float]* nocapture readonly dereferenceable(400) %b) #0 { +define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., [100 x float]* nocapture dereferenceable(400) %c, [100 x float]* nocapture readonly dereferenceable(400) %a, [100 x float]* nocapture readonly dereferenceable(400) %b) #0 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + store i32 0, i32* %.omp.lb, align 4 + store i32 99, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32, i32* %.global_tid., align 4 + call void @__kmpc_for_static_init_4(%ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %1 = load i32, i32* %.omp.ub, align 4 + 
%cmp = icmp sgt i32 %1, 99 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 99, %cond.true ], [ %2, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %3 = load i32, i32* %.omp.lb, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %.omp.iv.0 = phi i32 [ %3, %cond.end ], [ %add7, %omp.inner.for.inc ] + %4 = load i32, i32* %.omp.ub, align 4 + %cmp1 = icmp sle i32 %.omp.iv.0, %4 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %mul = mul nsw i32 %.omp.iv.0, 1 + %add = add nsw i32 0, %mul + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds [100 x float], [100 x float]* %a, i64 0, i64 %idxprom + %5 = load float, float* %arrayidx, align 4 + %idxprom2 = sext i32 %add to i64 + %arrayidx3 = getelementptr inbounds [100 x float], [100 x float]* %b, i64 0, i64 %idxprom2 + %6 = load float, float* %arrayidx3, align 4 + %add4 = fadd float %5, %6 + %idxprom5 = sext i32 %add to i64 + %arrayidx6 = getelementptr inbounds [100 x float], [100 x float]* %c, i64 0, i64 %idxprom5 + store float %add4, float* %arrayidx6, align 4 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %add7 = add nsw i32 %.omp.iv.0, 1 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%ident_t* @0, i32 %0) + ret void +} + +declare void @__kmpc_for_static_init_4(%ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%ident_t*, i32) + +declare void 
@__kmpc_fork_call(%ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare i32 @printf(i8*, ...) #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: tools/bugpoint/CMakeLists.txt =================================================================== --- tools/bugpoint/CMakeLists.txt +++ tools/bugpoint/CMakeLists.txt @@ -11,6 +11,7 @@ Instrumentation Linker ObjCARCOpts + ParallelIROpts ScalarOpts Support Target Index: tools/bugpoint/LLVMBuild.txt =================================================================== --- tools/bugpoint/LLVMBuild.txt +++ tools/bugpoint/LLVMBuild.txt @@ -29,5 +29,6 @@ Instrumentation Linker ObjCARC + ParallelIR Scalar all-targets Index: tools/opt/CMakeLists.txt =================================================================== --- tools/opt/CMakeLists.txt +++ tools/opt/CMakeLists.txt @@ -12,6 +12,7 @@ Instrumentation MC ObjCARCOpts + ParallelIROpts ScalarOpts Support Target Index: tools/opt/LLVMBuild.txt =================================================================== --- tools/opt/LLVMBuild.txt +++ tools/opt/LLVMBuild.txt @@ 
-27,6 +27,7 @@ IRReader IPO Instrumentation + ParallelIR Scalar ObjCARC Passes Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -418,6 +418,7 @@ PassRegistry &Registry = *PassRegistry::getPassRegistry(); initializeCore(Registry); initializeCoroutines(Registry); + initializeParallelIROpts(Registry); initializeScalarOpts(Registry); initializeObjCARCOpts(Registry); initializeVectorization(Registry);