Index: include/llvm/Analysis/ParallelIR/KMPCImpl.h =================================================================== --- /dev/null +++ include/llvm/Analysis/ParallelIR/KMPCImpl.h @@ -0,0 +1,198 @@ +//===- ParallelIR/KMPCImpl.h - KMPC parallel region impl. -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Interface of the parallel regions for the OpenMP KMPC runtime library call +// representation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_PARALLELIR_KMPCIMPL_H +#define LLVM_ANALYSIS_PARALLELIR_KMPCIMPL_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ParallelIR/RegionInfo.h" +#include "llvm/IR/Instructions.h" + +namespace llvm { + +class DominatorTree; +class KMPC_ParallelRegion; + +/// See @p ParallelIRCommunicationInfo +class KMPC_CommunicationInfo : public ParallelIRCommunicationInfo { + + /// Backwards reference to the parallel region for which this communication + /// interface was created. + const KMPC_ParallelRegion &PR; + + KMPC_CommunicationInfo(const KMPC_ParallelRegion &PR) : PR(PR) {} + +public: + + /// Return the runtime library call that initiates the communication. 
+ CallInst &getRTCall() const; + + /// See @p ParallelIRCommunicationInfo::getAllCommunicatingParallelRegions + virtual bool getAllCommunicatingParallelRegions( + SmallVectorImpl<ParallelRegion *> &CommunicatingParallelRegions) + const override; + + /// See @p ParallelIRCommunicationInfo::getNumCommunicatedValues + virtual unsigned getNumCommunicatedValues() const override; + + /// See @p ParallelIRCommunicationInfo::getCommunicatedValue + virtual Value *getCommunicatedValue(unsigned Idx) const override; + + /// See @p ParallelIRCommunicationInfo::getCommunicationKind + virtual CommunicationKind getCommunicationKind(unsigned Idx) const override; + + /// See @p ParallelIRCommunicationInfo::getCommunicatedValues + virtual void getCommunicatedValues( + SmallVectorImpl<Value *> &CommunicatedValues) const override; + + /// See @p ParallelIRCommunicationInfo::getCommunicatedValueInParallelRegion + virtual Value * + getCommunicatedValueInParallelRegion(unsigned Idx) const override; + + /// See @p ParallelIRCommunicationInfo::hasAnnotatableCommunication + virtual bool hasAnnotatableCommunication() const override; + + /// See @p ParallelIRCommunicationInfo::hasAttributeInParallelRegion + virtual bool + hasAttributeInParallelRegion(unsigned Idx, + Attribute::AttrKind Kind) const override; + + friend class KMPC_ParallelRegion; +}; + +/// Specialization of the ParallelRegion interface for the OpenMP KMPC runtime +/// library representation. +/// +/// Note: This class is abstract as well. It collects the commonalities between +/// KMPC_ForkParallelRegion and KMPC_TaskParallelRegion defined below. +/// +/// See @p ParallelRegion +class KMPC_ParallelRegion : public ParallelRegion { + + /// The subfunction that contains the parallel code. + Function &ParallelSubFn; + + /// The communication info object for this parallel region. 
+ KMPC_CommunicationInfo CommunicationInfo; + +public: + KMPC_ParallelRegion(CallInst &KMPC_CI, Function &ParallelSubFn, + ParallelIRRegionInfo &PRI) + : ParallelRegion(KMPC_CI, KMPC_CI, PRI), ParallelSubFn(ParallelSubFn), + CommunicationInfo(*this) {} + + /// Return the parallel subfunction for this parallel region. + /// + /// Note that there might be multiple regions sharing the same parallel + /// subfunction. + /// See @p ParallelIRCommunicationInfo::getAllCommunicatingParallelRegions + Function &getParallelSubFn() const { return ParallelSubFn; } + + /// See @p ParallelRegion::getFirstInsertionPoint + virtual Instruction &getFirstInsertionPoint() const override; + + /// See @p ParallelRegion::getSequentialCodeFunction + virtual Function &getSequentialCodeFunction() const override; + + /// See @p ParallelRegion::getParallelCodeFunction + virtual Function &getParallelCodeFunction() const override; + + /// See @p ParallelRegion::getDefiniteBarriers + virtual void getDefiniteBarriers( + SmallVectorImpl &DefiniteBarriers) const override; + + /// See @p ParallelRegion::getPotentialBarriers + virtual void getPotentialBarriers( + SmallVectorImpl &PotentialBarriers) const override; + + /// See @p ParallelRegion::getThreadId + virtual Value *getThreadId() const override; + + /// See @p ParallelRegion::getLocalThreadId + virtual Value *getLocalThreadId() const override; + + /// See @p ParallelRegion::contains + virtual bool contains(const BasicBlock *BB, + const DominatorTree *) const override; + + /// See @p ParallelRegion::contains + virtual bool contains(const Instruction *I, + const DominatorTree *) const override; + + /// See @p ParallelRegion::visit + virtual bool visit(InstructionVisitorTy &Visitor) const override; + + /// See @p ParallelRegion::visit + virtual bool visit(BlockVisitorTy &Visitor) const override; + + /// See @p ParallelRegion::print + virtual void print(raw_ostream &OS, unsigned indent) const override; + + /// See @p 
ParallelRegion::getCommunicationInfo + virtual const ParallelIRCommunicationInfo & + getCommunicationInfo() const override { + return CommunicationInfo; + } +}; + +/// See @p KMPC_ParallelRegion +class KMPC_ForkParallelRegion : public KMPC_ParallelRegion { + + /// Private constructor, generation via findKMPCForkCalls. + KMPC_ForkParallelRegion(CallInst &KMPC_ForkCI, Function &ParallelSubFn, + ParallelIRRegionInfo &PRI); + +public: + + /// See @p ParallelRegion::getKind + virtual ParallelRegionKind getKind() const override { + return PRK_KMPC_FORK_RT; + } + + /// See @p ParallelRegion::isKind + virtual bool isKind(ParallelRegionKind Kind) const override { + return (Kind & PRK_KMPC_FORK_RT) == Kind; + } + + /// Find all KMPC fork calls in @p M and register the KMPC_ForkParallelRegion + /// parallel regions with @p PRI. + static void findKMPCForkCalls(Module &M, ParallelIRRegionInfo &PRI); +}; + +/// See @p KMPC_ParallelRegion +class KMPC_TaskParallelRegion : public KMPC_ParallelRegion { + + /// Private constructor, generation via findKMPCTaskCalls. + KMPC_TaskParallelRegion(CallInst &KMPC_TaskCI, Function &ParallelSubFn, + ParallelIRRegionInfo &PRI); + +public: + + /// See @p ParallelRegion::getKind + virtual ParallelRegionKind getKind() const override { + return PRK_KMPC_TASK_RT; + } + + /// See @p ParallelRegion::isKind + virtual bool isKind(ParallelRegionKind Kind) const override { + return (Kind & PRK_KMPC_TASK_RT) == Kind; + } + + /// Find all KMPC task calls in @p M and register the KMPC_TaskParallelRegion + /// parallel regions with @p PRI. 
+ static void findKMPCTaskCalls(Module &M, ParallelIRRegionInfo &PRI); +}; + +} // namespace llvm +#endif Index: include/llvm/Analysis/ParallelIR/RegionInfo.h =================================================================== --- /dev/null +++ include/llvm/Analysis/ParallelIR/RegionInfo.h @@ -0,0 +1,348 @@ +//===- ParallelIRRegionInfo.h - Parallel region analysis --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Abstract analyses interfaces to inspect parallel codes and passes to provide +// information about parallelism inside the module. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_PARALLELIR_REGIONINFO_H +#define LLVM_ANALYSIS_PARALLELIR_REGIONINFO_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" + +namespace llvm { + +class ParallelRegion; +class ParallelIRBuilder; + +/// Helper structure that decouples communication related queries from the +/// parallel region interface. +/// +/// Communication is conceptually divided into two parts, the sequential and the +/// parallel one. The communicated values could be different on the other side +/// and they are therefore only identified by their index. The way values can be +/// communicated is either by-value or inside a container, thus through a +/// pointer type that is dereferenced. +struct ParallelIRCommunicationInfo { + + /// Return all parallel regions that might be involved with either side of + /// the parallel communication interface. There might be multiple if code was + /// reused or parts of the interface were duplicated, e.g., though unrolling. 
+ virtual bool getAllCommunicatingParallelRegions( + SmallVectorImpl &CommunicatingParallelRegions) + const = 0; + + /// Flags to distinguish the different kinds of communication. + enum CommunicationKind { + CK_VALUE, ///< communication by-value + CK_CONTAINER_IN, ///< communication through a read-only container + CK_CONTAINER_OUT, ///< communication through a write-only container + CK_CONTAINER_IN_OUT, ///< communication through a container + CK_UNKNOWN, ///< unknown/complication communication + }; + + /// Return the number of communicated values. + virtual unsigned getNumCommunicatedValues() const = 0; + + /// Return the communicated value number @p Idx in the sequential code. + virtual Value *getCommunicatedValue(unsigned Idx) const = 0; + + /// Return the communicated value number @p Idx in the parallel code. + virtual Value *getCommunicatedValueInParallelRegion(unsigned Idx) const = 0; + + /// Return the kind of communication used for value number @p Idx. + virtual CommunicationKind getCommunicationKind(unsigned Idx) const = 0; + + /// Return all communicated values in the sequential code. + virtual void getCommunicatedValues( + SmallVectorImpl &CommunicatedValues) const = 0; + + /// Return true if the communication can be annotated with attributes. + virtual bool hasAnnotatableCommunication() const = 0; + + /// Return true if the communicated value @p Idx in the parallel code has + /// attribute @p Kind. If attribute annotation is not possible this function + /// shall gracefully return false. + virtual bool hasAttributeInParallelRegion(unsigned Idx, + Attribute::AttrKind Kind) const = 0; +}; + +/// The parallel region info (PRI) identifies parallel regions and provides +/// convenient information on them. +/// +/// Currently the parallel region info is "lazy" in the sense that it does only +/// need to be updated if new parallel regions are created (or deleted). 
As this +/// should not happen very often (and only in very few places) it allows +/// transformation passes to preserve the parallel region info without +/// modifications. Additionally, it makes the analysis very lightweight in the +/// absence of parallel regions (which should be the majority of functions). +/// +class ParallelIRRegionInfo { +public: + + /// Container type for parallel regions. + using ParallelRegionContainer = SmallVector; + using ParallelRegionContainerMap = + DenseMap; + + /// Iterator types for the parallel region container. + using iterator = ParallelRegionContainerMap::iterator; + using const_iterator = ParallelRegionContainerMap::const_iterator; + +private: + + /// The parallel regions discovered in the program. + ParallelRegionContainerMap ParallelRegionsMap; + + /// Register the parallel region @p PR. + void addParallelRegion(ParallelRegion &PR); + +public: + ParallelIRRegionInfo() {} + ParallelIRRegionInfo(Module &M) { + recalculate(M); + } + ~ParallelIRRegionInfo() { releaseMemory(); } + + /// Identify the parallel regions in @p M from scratch. + void recalculate(Module &M); + + /// Return the parallel region for @p I if any. + ParallelRegion *getParallelRegionFor(Instruction *I) const; + + /// Return a vector with all parallel regions in this function. + /// + ///{ + ParallelRegionContainer getParallelRegions(Function &F) const { + return ParallelRegionsMap.lookup(&F); + } + ///} + + /// Iterators to visit all parallel regions, function by function + /// + ///{ + iterator begin() { return ParallelRegionsMap.begin(); } + iterator end() { return ParallelRegionsMap.end(); } + + const_iterator begin() const { return ParallelRegionsMap.begin(); } + const_iterator end() const { return ParallelRegionsMap.end(); } + ///} + + /// Delete all memory allocated for parallel regions. + void releaseMemory(); + + /// Pretty print all parallel regions. 
+ ///{ + void print(raw_ostream &OS) const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const; +#endif + ///} + + friend class ParallelRegion; + friend class ParallelIRBuilder; +}; + +/// A parallel region is a single-entry, single-exit CFG region that +/// represents code that can be executed in parallel. +class ParallelRegion { + + /// The start point of this parallel region. + Instruction &StartPoint; + + /// The end point of this parallel region. + Instruction &EndPoint; + + /// The parallel region info analysis. + ParallelIRRegionInfo &PRI; + +protected: + + ParallelRegion(Instruction &StartPoint, Instruction &EndPoint, + ParallelIRRegionInfo &PRI) + : StartPoint(StartPoint), EndPoint(EndPoint), PRI(PRI) { + // Register a new parallel region always with the parallel region info. + PRI.addParallelRegion(*this); + } + +public: + virtual ~ParallelRegion(); + + /// Return the start point of this parallel region. + Instruction &getStartPoint() const { return StartPoint; } + + /// Return the end point of this parallel region. + Instruction &getEndPoint() const { return EndPoint; } + + /// Return the first instruction in this parallel region before which new code + /// can be inserted. + virtual Instruction &getFirstInsertionPoint() const = 0; + + /// Return the function that contains the sequential code surrounding the + /// parallel region. + virtual Function &getSequentialCodeFunction() const = 0; + + /// Return the function that contains the code executed in parallel. + virtual Function &getParallelCodeFunction() const = 0; + + /// Return all definite barrier instructions. + virtual void getDefiniteBarriers( + SmallVectorImpl &DefiniteBarriers) const = 0; + + /// Return all potential barrier instructions. + virtual void getPotentialBarriers( + SmallVectorImpl &PotentialBarriers) const = 0; + + /// Return true if @p I might have barrier semantics for this parallel region. 
+ virtual bool isPotentialBarrier(Instruction &I) const; + + /// Enumeration of all known parallel region kinds. + enum ParallelRegionKind { + PRK_KMPC_RT = 4, ///< General KMPC runtime call + PRK_KMPC_FORK_RT = 5, ///< KMPC fork runtime call + PRK_KMPC_TASK_RT = 6, ///< KMPC task runtime call + }; + + /// Return the kind of this parallel region. + virtual ParallelRegionKind getKind() const = 0; + + /// Return true if this parallel region is of kind @p Kind. + virtual bool isKind(ParallelRegionKind Kind) const = 0; + + /// Return the global thread id if applicable and present. + /// + /// Note: The ParallelIR/Builder interface allows to create a new thread id. + virtual Value *getThreadId() const = 0; + + /// Return the local thread id if applicable and present. + /// + /// Note: The ParallelIR/Builder interface allows to create a new thread id. + virtual Value *getLocalThreadId() const = 0; + + /// Return a lightweight communication info object for this parallel region. + virtual const ParallelIRCommunicationInfo &getCommunicationInfo() const = 0; + + /// Type of the instruction visitor function. + /// + /// It will be invoked for every instruction in this parallel region until the + /// return value of the visitor is false. Note that only proper instructions + /// inside the parallel region are visited, thus no encoding instructions + /// only present to mark the parallel region. + /// + /// The return value indicates if the traversal should continue. + using InstructionVisitorTy = std::function<bool(Instruction &)>; + + /// Type of the block visitor function. + /// + /// It will be invoked for every basic block in this parallel region until the + /// return value of the visitor is false. The second argument is true only if + /// the block is not completely contained in the parallel region. + /// + /// The return value indicates if the traversal should continue. + using BlockVisitorTy = std::function<bool(BasicBlock &, bool)>; + + /// A generic visitor interface as an alternative to an iterator. 
+ /// + /// @returns True, if all instructions/blocks have been visited. + ///{ + virtual bool visit(InstructionVisitorTy &Visitor) const = 0; + virtual bool visit(BlockVisitorTy &Visitor) const = 0; + ///} + + /// The contain interface is designed deliberately different from similar + /// functions like Loop::contains(*) as it might take a dominator tree as a + /// second argument. This allows the ParallelRegion to remain valid even if + /// transformations change the CFG structure inside. As a consequence there + /// are less modifications needed in the existing code base. + ///{ + virtual bool contains(const BasicBlock *BB, + const DominatorTree *DT = nullptr) const; + virtual bool contains(const Instruction *I, + const DominatorTree *DT = nullptr) const; + ///} + + const ParallelIRRegionInfo &getParallelRegionInfo() const { return PRI; }; + + /// Pretty print this parallel region. + ///{ + virtual void print(raw_ostream &OS, unsigned indent = 0) const = 0; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const; +#endif + ///} + + friend class ParallelIRRegionInfo; +}; + +/// Pretty print the parallel region @p PR to @p OS. +inline raw_ostream &operator<<(raw_ostream &OS, const ParallelRegion &PR) { + PR.print(OS); + return OS; +} + +/// New pass manager wrapper pass around the parallel region info. +class ParallelIRRegionAnalysis + : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + typedef ParallelIRRegionInfo Result; + + /// Run the analysis pass over a module and identify the parallel regions. + /// + /// FIXME: This does not need to be a module pass but dependent passes in the + /// old pass manager do not work otherwise. + ParallelIRRegionInfo run(Module &M, ModuleAnalysisManager &MAM); +}; + +/// Module pass wrapper around the parallel region info. +/// +/// FIXME: This does not need to be a module pass but dependent passes in the +/// old pass manager do not work otherwise. 
+class ParallelIRRegionInfoPass : public ModulePass { + ParallelIRRegionInfo PRI; + +public: + static char ID; + ParallelIRRegionInfoPass() : ModulePass(ID) {} + + /// Return the parallel region info analysis. + ///{ + ParallelIRRegionInfo &getParallelIRRegionInfo() { return PRI; } + const ParallelIRRegionInfo &getParallelIRRegionInfo() const { return PRI; } + ///} + + /// Initialize the parallel region info for this function. + bool runOnModule(Module &) override; + + /// Verify the analysis as well as some of the functions provided. + void verifyAnalysis() const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Pretty print the parallel regions of the function. + ///{ + void print(raw_ostream &OS, const Module *) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const; +#endif + ///} +}; + +} // End llvm namespace +#endif Index: include/llvm/Analysis/Passes.h =================================================================== --- include/llvm/Analysis/Passes.h +++ include/llvm/Analysis/Passes.h @@ -79,6 +79,13 @@ // FunctionPass *createRegionInfoPass(); + //===--------------------------------------------------------------------===// + // + // createParallelRegionInfoPass - This pass finds all parallel regions + // in a function. + // + ModulePass *createParallelIRRegionInfoPass(); + // Print module-level debug info metadata in human-readable form. ModulePass *createModuleDebugInfoPrinterPass(); Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -61,6 +61,9 @@ /// Initialize all passes linked into the CodeGen library. void initializeTarget(PassRegistry&); +/// Initialize all passes linked into the ParallelIROpts library. 
+void initializeParallelIROpts(PassRegistry&); + void initializeAAEvalLegacyPassPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeADCELegacyPassPass(PassRegistry&); @@ -285,12 +288,14 @@ void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry&); void initializeOptimizePHIsPass(PassRegistry&); void initializePAEvalPass(PassRegistry&); +void initializeParallelIRRegionInfoPassPass(PassRegistry&); void initializePEIPass(PassRegistry&); void initializePGOIndirectCallPromotionLegacyPassPass(PassRegistry&); void initializePGOInstrumentationGenLegacyPassPass(PassRegistry&); void initializePGOInstrumentationUseLegacyPassPass(PassRegistry&); void initializePGOMemOPSizeOptLegacyPassPass(PassRegistry&); void initializePHIEliminationPass(PassRegistry&); +void initializeParallelIRAttributeAnnotatorLegacyPassPass(PassRegistry&); void initializePartialInlinerLegacyPassPass(PassRegistry&); void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry&); void initializePatchableFunctionPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -47,6 +47,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Instrumentation/BoundsChecking.h" #include "llvm/Transforms/ObjCARC.h" +#include "llvm/Transforms/ParallelIR.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils.h" @@ -188,6 +189,8 @@ (void) llvm::createMergeFunctionsPass(); (void) llvm::createMergeICmpsPass(); (void) llvm::createExpandMemCmpPass(); + (void) llvm::createParallelIRRegionInfoPass(); + (void) llvm::createParallelIRAttributeAnnotatorLegacyPass(); std::string buf; llvm::raw_string_ostream os(buf); (void) llvm::createPrintModulePass(os); Index: include/llvm/Transforms/ParallelIR.h =================================================================== 
--- /dev/null +++ include/llvm/Transforms/ParallelIR.h @@ -0,0 +1,28 @@ +//===-- ParallelIR.h - Parallel IR Transformations --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for accessor functions that expose passes +// in the ParallelIR transformations library. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_PARALLEL_IR_H +#define LLVM_TRANSFORMS_PARALLEL_IR_H + +namespace llvm { + +class ModulePass; + +//===----------------------------------------------------------------------===// +// +ModulePass *createParallelIRAttributeAnnotatorLegacyPass(); + +} // End llvm namespace + +#endif Index: include/llvm/Transforms/ParallelIR/AttributeAnnotator.h =================================================================== --- /dev/null +++ include/llvm/Transforms/ParallelIR/AttributeAnnotator.h @@ -0,0 +1,30 @@ +//===- AttributeAnnotator.h ----- Annotate attr. from/to parallel regions -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_PARALLEL_IR_ATTRIBUTE_ANNOTATOR_H +#define LLVM_TRANSFORMS_PARALLEL_IR_ATTRIBUTE_ANNOTATOR_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class Module; + +struct ParallelIRAttributeAnnotatorPass + : PassInfoMixin { + PreservedAnalyses run(Module &M, ModuleAnalysisManager &); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_PARALLEL_IR_REGION_MERGE_H Index: include/llvm/Transforms/ParallelIR/Builder.h =================================================================== --- /dev/null +++ include/llvm/Transforms/ParallelIR/Builder.h @@ -0,0 +1,50 @@ +//===- ParallelIR/Builder.h - Parallel region IR builder ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_PARALLELIR_BUILDER_H +#define LLVM_IR_PARALLELIR_BUILDER_H + +#include "llvm/Analysis/ParallelIR/RegionInfo.h" + +namespace llvm { + +/// Interface to modify and create parallel regions. The potentially different +/// implementation of this interface for the different kinds of parallel regions +/// will apply the modifications requested through these calls if that is +/// possible. If not, they shall gracefully ignore the request. +struct ParallelIRBuilder { + + /// Create a parallel IR builder for the region kind @p PRKind. 
+ static ParallelIRBuilder *Create(ParallelIRRegionInfo &PRI, + ParallelRegion::ParallelRegionKind PRKind); + + /// Add the attribute @p Kind to the communicated value at index @p Idx in the + /// sequential part of the communication interface defined by @p PRCI. + virtual bool + addAttributeInSequentialRegion(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, + Attribute::AttrKind Kind) const = 0; + + /// Add the attribute @p Kind to the communicated value at index @p Idx in the + /// parallel part of the communication interface defined by @p PRCI. + virtual bool + addAttributeInParallelRegion(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, + Attribute::AttrKind Kind) const = 0; + + /// Add the attribute @p Kind to the communicated value at index @p Idx in + /// both parts of the communication interface defined by @p PRCI. + virtual bool addAttribute(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, Attribute::AttrKind Kind) const = 0; +}; + +} +#endif Index: lib/Analysis/Analysis.cpp =================================================================== --- lib/Analysis/Analysis.cpp +++ lib/Analysis/Analysis.cpp @@ -82,6 +82,7 @@ initializeLCSSAVerificationPassPass(Registry); initializeMemorySSAWrapperPassPass(Registry); initializeMemorySSAPrinterLegacyPassPass(Registry); + initializeParallelIRRegionInfoPassPass(Registry); } void LLVMInitializeAnalysis(LLVMPassRegistryRef R) { Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -87,6 +87,9 @@ ValueTracking.cpp VectorUtils.cpp + ParallelIR/RegionInfo.cpp + ParallelIR/KMPCImpl.cpp + ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis Index: lib/Analysis/ParallelIR/KMPCImpl.cpp =================================================================== --- /dev/null +++ lib/Analysis/ParallelIR/KMPCImpl.cpp @@ -0,0 +1,282 @@ +//===- KMPCImpl.cpp - OpenMP runtime (KMPC) parallel 
region impl. ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the parallel regions for the OpenMP KMPC runtime library +// call representation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ParallelIR/KMPCImpl.h" + +#include "llvm/IR/Verifier.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "parallel-region-info" + +//===----------------------------------------------------------------------===// +// KMPC runtime parallel region implementation +// + +/// Return true if @p V is a call to the function @p CalledDecl in @p F. +static bool isCallToFunctionIn(Value *V, Function *CalledDecl, Function *F) { + auto *CI = dyn_cast<CallInst>(V); + return (CI && CI->getCalledFunction() == CalledDecl && + (!F || CI->getFunction() == F)); +} + +/// Put all calls to @p Name in @p F into the container @p Calls as @p RetTy. +template <typename RetTy> +static void collectCallsToInFunction(std::string Name, Function *F, Module &M, + SmallVectorImpl<RetTy *> &Calls) { + + // Look for the "Name" function declaration in the Module. If found, the users + // are possible calls. 
+ Function *FunctionDecl = M.getFunction(Name); + if (!FunctionDecl) + return; + + for (User *U : FunctionDecl->users()) + if (isCallToFunctionIn(U, FunctionDecl, F)) + Calls.push_back(cast(U)); +} + +Instruction &KMPC_ParallelRegion::getFirstInsertionPoint() const { + return ParallelSubFn.getEntryBlock().front(); + } + +Function &KMPC_ParallelRegion::getSequentialCodeFunction() const { + return *getStartPoint().getFunction(); +} + +Function &KMPC_ParallelRegion::getParallelCodeFunction() const { + return ParallelSubFn; +} + +void KMPC_ParallelRegion::getDefiniteBarriers( + SmallVectorImpl &DefiniteBarriers) const { + collectCallsToInFunction("__kmpc_barrier", &ParallelSubFn, + *ParallelSubFn.getParent(), DefiniteBarriers); +} + +void KMPC_ParallelRegion::getPotentialBarriers( + SmallVectorImpl &PotentialBarriers) const { + InstructionVisitorTy BarrierCollector = [&](Instruction &I) { + if (!ParallelRegion::isPotentialBarrier(I)) + return true; + + CallInst *CI = dyn_cast(&I); + if (!CI) + return true; + + if (CI->getCalledFunction()) { + const StringRef &Name = CI->getCalledFunction()->getName(); + if (Name == "__kmpc_for_static_init_4" || + Name == "__kmpc_for_static_fini") + return true; + } + + PotentialBarriers.push_back(&I); + return true; + }; + + visit(BarrierCollector); +} + +Value *KMPC_ParallelRegion::getThreadId() const { + Value *ThreadIdPtr = ParallelSubFn.arg_begin(); + + for (Value *User : ThreadIdPtr->users()) + if (LoadInst *LI = dyn_cast(User)) + return LI; + + return nullptr; +} + +Value *KMPC_ParallelRegion::getLocalThreadId() const { + Value *LocalThreadIdPtr = ParallelSubFn.arg_begin() + 1; + + for (Value *User : LocalThreadIdPtr->users()) + if (LoadInst *LI = dyn_cast(User)) + return LI; + + return nullptr; +} + +bool KMPC_ParallelRegion::contains(const BasicBlock *BB, + const DominatorTree *) const { + return BB->getParent() == &ParallelSubFn; +} + +bool KMPC_ParallelRegion::contains(const Instruction *I, + const DominatorTree *) const { + 
return I->getFunction() == &ParallelSubFn || I == &getStartPoint(); +} + +bool KMPC_ParallelRegion::visit(InstructionVisitorTy &Visitor) const { + for (BasicBlock &BB : ParallelSubFn) + for (Instruction &I : BB) + if (!Visitor(I)) + return false; + return true; +} + +bool KMPC_ParallelRegion::visit(BlockVisitorTy &Visitor) const { + for (BasicBlock &BB : ParallelSubFn) + if (!Visitor(BB, false)) + return false; + return true; +} + +void KMPC_ParallelRegion::print(raw_ostream &OS, unsigned indent) const { + OS.indent(indent) << "Parallel Region [" << getKind() << "]:\n"; + OS.indent(indent) << " fork call: " << getStartPoint() << "\n"; + OS.indent(indent) << " sub-function: " << ParallelSubFn.getName() + << "\n"; + + const ParallelIRCommunicationInfo &CI = getCommunicationInfo(); + for (unsigned u = 0; u < CI.getNumCommunicatedValues(); u++) + OS.indent(indent) << " communicated: " << *CI.getCommunicatedValue(u) + << " : " << CI.getCommunicationKind(u) << "\n"; +} + +//===----------------------------------------------------------------------===// +// KMPC_TaskParallelRegion implementation +// + +KMPC_TaskParallelRegion::KMPC_TaskParallelRegion(CallInst &KMPC_TaskCI, + Function &ParallelSubFn, + ParallelIRRegionInfo &PRI) + : KMPC_ParallelRegion(KMPC_TaskCI, ParallelSubFn, PRI) {} + +void KMPC_TaskParallelRegion::findKMPCTaskCalls( + Module &M, ParallelIRRegionInfo &PRI) { + SmallVector KMPC_TaskCalls; + collectCallsToInFunction("__kmpc_omp_task", nullptr, M, KMPC_TaskCalls); + + // Calls of the "__kmpc_omp_task" function are actually parallel regions. 
+ for (CallInst *CI : KMPC_TaskCalls) { + + Function *ParallelFunc = + cast(CI->getArgOperand(2)->stripPointerCasts()); + assert(ParallelFunc); + + new KMPC_TaskParallelRegion(*CI, *ParallelFunc, PRI); + } +} + +void findKMPCTaskCalls(Module &M, ParallelIRRegionInfo &PRI) { + KMPC_TaskParallelRegion::findKMPCTaskCalls(M, PRI); +} + +//===----------------------------------------------------------------------===// +// KMPC_ForkParallelRegion implementation +// + +KMPC_ForkParallelRegion::KMPC_ForkParallelRegion(CallInst &KMPC_ForkCI, + Function &ParallelSubFn, + ParallelIRRegionInfo &PRI) + : KMPC_ParallelRegion(KMPC_ForkCI, ParallelSubFn, PRI) {} + +void KMPC_ForkParallelRegion::findKMPCForkCalls( + Module &M, ParallelIRRegionInfo &PRI) { + SmallVector KMPC_ForkCalls; + collectCallsToInFunction("__kmpc_fork_call", nullptr, M, KMPC_ForkCalls); + + // Calls of the "__kmpc_fork_call" function are actually parallel regions. + for (CallInst *CI : KMPC_ForkCalls) { + + Function *ParallelFunc = + cast(CI->getArgOperand(2)->stripPointerCasts()); + assert(ParallelFunc); + + new KMPC_ForkParallelRegion(*CI, *ParallelFunc, PRI); + } +} + +void findKMPCForkCalls(Module &M, ParallelIRRegionInfo &PRI) { + KMPC_ForkParallelRegion::findKMPCForkCalls(M, PRI); +} + +//===----------------------------------------------------------------------===// +// CommunicationInfo implementation +// + +CallInst &KMPC_CommunicationInfo::getRTCall() const { + return cast(PR.getStartPoint()); +} + +bool KMPC_CommunicationInfo::getAllCommunicatingParallelRegions( + SmallVectorImpl &CommunicatingParallelRegions) const { + + Function &ParallelFn = PR.getParallelSubFn(); + const ParallelIRRegionInfo &PRI = PR.getParallelRegionInfo(); + for (const auto &It : PRI) + for (ParallelRegion *PR : It.getSecond()) + if (&PR->getParallelCodeFunction() == &ParallelFn) + CommunicatingParallelRegions.push_back(PR); + + return true; +} + +unsigned KMPC_CommunicationInfo::getNumCommunicatedValues() const { + CallInst 
&CI = getRTCall(); + return CI.getNumArgOperands() - 3; +} + +Value *KMPC_CommunicationInfo::getCommunicatedValue(unsigned Idx) const { + CallInst &CI = getRTCall(); + return CI.getArgOperand(Idx + 3); +} + +ParallelIRCommunicationInfo::CommunicationKind +KMPC_CommunicationInfo::getCommunicationKind(unsigned Idx) const { + Value *CommunicatedValue = getCommunicatedValue(Idx); + if (!CommunicatedValue->getType()->isPointerTy()) + return CK_VALUE; + + Argument *CommunicatedValueArg = + cast(getCommunicatedValueInParallelRegion(Idx)); + if (!CommunicatedValueArg->hasNoCaptureAttr()) + return CK_VALUE; + + if (CommunicatedValueArg->hasAttribute(Attribute::ReadNone)) + return CK_VALUE; + + if (CommunicatedValueArg->hasAttribute(Attribute::WriteOnly)) + return CK_CONTAINER_OUT; + + if (CommunicatedValueArg->hasAttribute(Attribute::ReadOnly)) + return CK_CONTAINER_IN; + + return CK_CONTAINER_IN_OUT; +} + +void KMPC_CommunicationInfo::getCommunicatedValues( + SmallVectorImpl &CommunicatedValues) const { + CallInst &CI = getRTCall(); + for (unsigned u = 3, e = CI.getNumArgOperands(); u < e; u++) + CommunicatedValues.push_back(CI.getArgOperand(u)); +} + +Value *KMPC_CommunicationInfo::getCommunicatedValueInParallelRegion( + unsigned Idx) const { + return PR.getParallelSubFn().arg_begin() + 2 + Idx; +} + +bool KMPC_CommunicationInfo::hasAnnotatableCommunication() const { + return PR.getKind() == ParallelRegion::PRK_KMPC_FORK_RT; +} + +bool KMPC_CommunicationInfo::hasAttributeInParallelRegion(unsigned Idx, + Attribute::AttrKind Kind) const { + return cast(getCommunicatedValueInParallelRegion(Idx))->hasAttribute(Kind); +} Index: lib/Analysis/ParallelIR/RegionInfo.cpp =================================================================== --- /dev/null +++ lib/Analysis/ParallelIR/RegionInfo.cpp @@ -0,0 +1,174 @@ +//===- ParallelIRRegionInfo.cpp - Parallel region detection analysis +//--------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the 
University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the ParallelIR/RegionInfo analysis. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ParallelIR/RegionInfo.h" + +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/ADT/Statistic.h" + +using namespace llvm; + +#define DEBUG_TYPE "parallel-region-info" + +STATISTIC(NumParallelRegions, "The # of parallel regions"); + +//===----------------------------------------------------------------------===// +// ParallelRegion implementation +// + +ParallelRegion::~ParallelRegion() {} + +bool ParallelRegion::isPotentialBarrier(Instruction &I) const { + + if (!isa(I)) + return false; + + if (isa(I)) + return false; + + if (!I.mayHaveSideEffects() && !I.mayReadFromMemory()) + return false; + + return true; +} + +bool ParallelRegion::contains(const BasicBlock *BB, + const DominatorTree *DT) const { + bool Contains = false; + + // Fallback to a search of all blocks in this task. + BlockVisitorTy BBVisitor = [BB, &Contains](BasicBlock &CurrentBB, + bool Boundary) { + if (BB != &CurrentBB) + return true; + Contains = !Boundary; + return false; + }; + + visit(BBVisitor); + return Contains; +} + +bool ParallelRegion::contains(const Instruction *I, + const DominatorTree *DT) const { + bool Contains = false; + + // Fallback to a search of all blocks in this task. 
+ InstructionVisitorTy InstVisitor = [I, &Contains](Instruction &CurI) { + if (I != &CurI) + return true; + Contains = true; + return false; + }; + + visit(InstVisitor); + return Contains; +} + +void ParallelRegion::dump() const { return print(dbgs()); } + +//===----------------------------------------------------------------------===// +// ParallelIR/RegionInfo implementation +// + +void ParallelIRRegionInfo::addParallelRegion(ParallelRegion &PR) { + NumParallelRegions++; + ParallelRegionsMap[PR.getStartPoint().getFunction()].push_back(&PR); +} + +void ParallelIRRegionInfo::print(raw_ostream &OS) const { + for (auto &It : ParallelRegionsMap) { + assert(It.second.size()); + OS << "Parallel region in " << It.first->getName() << " [" + << It.second.size() << "]:\n"; + for (auto *PR : It.second) + PR->print(OS); + } +} + +void ParallelIRRegionInfo::dump() const { print(dbgs()); } + +void ParallelIRRegionInfo::releaseMemory() { + for (auto &It : ParallelRegionsMap) + DeleteContainerPointers(It.second); + ParallelRegionsMap.clear(); +} + +void findKMPCForkCalls(Module &, ParallelIRRegionInfo &); + +void ParallelIRRegionInfo::recalculate(Module &M) { + releaseMemory(); + + bool RecognizeKMPCFork = true; + + if (RecognizeKMPCFork) + findKMPCForkCalls(M, *this); + +} + +ParallelRegion *ParallelIRRegionInfo::getParallelRegionFor(Instruction *I) const { + for (ParallelRegion *PR : getParallelRegions(*I->getFunction())) + if (PR->contains(I)) + return PR; + return nullptr; +} + +//===----------------------------------------------------------------------===// +// ParallelRegionAnalysis implementation +// + +AnalysisKey ParallelIRRegionAnalysis::Key; + +ParallelIRRegionInfo +ParallelIRRegionAnalysis::run(Module &M, ModuleAnalysisManager &MAM) { + return ParallelIRRegionInfo(M); +} + +//===----------------------------------------------------------------------===// +// ParallelIRRegionInfoPass implementation +// + +bool ParallelIRRegionInfoPass::runOnModule(Module &M) { + 
PRI.recalculate(M); + return false; +} + +void ParallelIRRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +void ParallelIRRegionInfoPass::print(raw_ostream &OS, const Module *) const { + PRI.print(OS); +} + +void ParallelIRRegionInfoPass::verifyAnalysis() const { + // TODO Not implemented but merely a stub. +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void ParallelIRRegionInfoPass::dump() const { PRI.dump(); } +#endif + +char ParallelIRRegionInfoPass::ID = 0; + +INITIALIZE_PASS_BEGIN(ParallelIRRegionInfoPass, "pir-regions", + "Detect parallel regions", false, true) +INITIALIZE_PASS_END(ParallelIRRegionInfoPass, "pir-regions", + "Detect parallel regions", false, true) + +namespace llvm { +ModulePass *createParallelIRRegionInfoPass() { + return new ParallelIRRegionInfoPass(); +} +} // namespace llvm Index: lib/Passes/LLVMBuild.txt =================================================================== --- lib/Passes/LLVMBuild.txt +++ lib/Passes/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Passes parent = Libraries -required_libraries = AggressiveInstCombine Analysis CodeGen Core IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation +required_libraries = AggressiveInstCombine Analysis CodeGen Core IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation ParallelIR Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -50,6 +50,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" +#include "llvm/Analysis/ParallelIR/RegionInfo.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/UnreachableBlockElim.h" #include "llvm/IR/Dominators.h" @@ -137,6 +138,7 @@ #include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h" #include 
"llvm/Transforms/Scalar/SpeculativeExecution.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" +#include "llvm/Transforms/ParallelIR/AttributeAnnotator.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" #include "llvm/Transforms/Utils/BreakCriticalEdges.h" #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" Index: lib/Passes/PassRegistry.def =================================================================== --- lib/Passes/PassRegistry.def +++ lib/Passes/PassRegistry.def @@ -26,6 +26,7 @@ MODULE_ANALYSIS("profile-summary", ProfileSummaryAnalysis()) MODULE_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) MODULE_ANALYSIS("verify", VerifierAnalysis()) +MODULE_ANALYSIS("pir-regions", ParallelIRRegionAnalysis()) #ifndef MODULE_ALIAS_ANALYSIS #define MODULE_ALIAS_ANALYSIS(NAME, CREATE_PASS) \ @@ -76,6 +77,7 @@ MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation()) MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass()) MODULE_PASS("verify", VerifierPass()) +MODULE_PASS("pir-attribute-annotator", ParallelIRAttributeAnnotatorPass()) #undef MODULE_PASS #ifndef CGSCC_ANALYSIS Index: lib/Transforms/CMakeLists.txt =================================================================== --- lib/Transforms/CMakeLists.txt +++ lib/Transforms/CMakeLists.txt @@ -8,3 +8,4 @@ add_subdirectory(Hello) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) +add_subdirectory(ParallelIR) Index: lib/Transforms/IPO/LLVMBuild.txt =================================================================== --- lib/Transforms/IPO/LLVMBuild.txt +++ lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ name = IPO parent = Transforms library_name = ipo -required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation +required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar 
Support TransformUtils Vectorize Instrumentation ParallelIR Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -36,6 +36,7 @@ #include "llvm/Transforms/IPO/InferFunctionAttrs.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/ParallelIR.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" @@ -469,6 +470,17 @@ // Infer attributes about declarations if possible. MPM.add(createInferFunctionAttrsLegacyPass()); + if (OptLevel > 2) { + // Add parallel optimizations to the pass pipeline. + // FIXME: This should only happen if the input contains + // parallel constructs as we also add canonicalization + // passes that might disturb the regular pipeline. + // TODO: We actually only need CG SCC passes but as we rely on + // function passes the old pass manager forces us to use + // module passes here. 
+ MPM.add(createParallelIRAttributeAnnotatorLegacyPass()); + } + addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); if (OptLevel > 2) Index: lib/Transforms/LLVMBuild.txt =================================================================== --- lib/Transforms/LLVMBuild.txt +++ lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC +subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC ParallelIR [component_0] type = Group Index: lib/Transforms/ParallelIR/AttributeAnnotator.cpp =================================================================== --- /dev/null +++ lib/Transforms/ParallelIR/AttributeAnnotator.cpp @@ -0,0 +1,388 @@ +//===- AttributeAnnotator.cpp -- Annotate attr. from/to parallel regions -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Attribute annotator for parallel regions. +// +// This pass tries to add attributes to the instructions representing parallel +// regions but also to the parallel regions itself, e.g., their arguments if +// applicable. +// +// TODO: This should actually be a SCC pass on the call graph. However, the old +// pass manager doesn't allow us to use the function analyses the way we +// want/need to so it is a module pass for now. 
+//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/ParallelIR/AttributeAnnotator.h" + +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/ParallelIR/RegionInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/ParallelIR.h" +#include "llvm/Transforms/ParallelIR/Builder.h" + +#define DEBUG_TYPE "pir-attribute-annotator" + +using namespace llvm; + +STATISTIC(NumNoAliasArguments, "Number of no-alias parallel region arguments"); +STATISTIC(NumNoCaptureParameters, + "Number of no-capture parallel region parameters"); +STATISTIC(NumReadNoneParameters, + "Number of read-none parallel region parameters"); +STATISTIC(NumReadOnlyParameters, + "Number of read-only parallel region parameters"); +STATISTIC(NumWriteOnlyParameters, + "Number of write-only parallel region parameters"); + +static cl::opt AnnotateAttributes("pir-annotate-attributes", + cl::desc("Annotate attributes"), + cl::Hidden, cl::init(true), + cl::ZeroOrMore); + +namespace { + +// FIXME: Helper type necessary as long as the parallel IR passes are +// implemented as module not SCC passes on the CG. +template +using FuncResultProviderTy = std::function; + +/// Transformer that identifies merge opportunities and applies code changes. +struct AttributeAnnotator { + + /// Constructor that accepts the parallel region info and analyses providers. + /// + /// @param PRI The parallel region info for this module. + /// @param AAProvider Callback to get alias information for a function. 
+ AttributeAnnotator(ParallelIRRegionInfo &PRI, + FuncResultProviderTy &AAProvider, + FuncResultProviderTy &DTProvider) + : PRI(PRI), AAProvider(AAProvider), DTProvider(DTProvider) {} + + /// Run the attribute Annotator pass on the parallel regions in @p M. + /// + /// @param M The module to run on. + /// + /// @returns True, if any change was made, false otherwise. + bool runOnModule(Module &M); + +private: + /// Parameter attributes that we try to move from inside the parallel region + /// to the outside. + Attribute::AttrKind const ParameterAttributes[4] = { + Attribute::NoCapture, Attribute::ReadNone, Attribute::ReadOnly, + Attribute::WriteOnly}; + + std::string const ParameterAttributeNames[4] = {"no-capture", "read-none", + "read-only", "write-only"}; + + /// Statistics to keep track of annotated parameters (see above). + Statistic *const ParameterAttributeStatistics[4] = { + &NumNoCaptureParameters, &NumReadNoneParameters, &NumReadOnlyParameters, + &NumWriteOnlyParameters}; + + /// Try to annotate arguments with no-alias, nocapture, etc. attributes. + bool annotateArgumentAttributes(ParallelRegion &PR); + + /// Annotate parallel region instructions with domain knowledge. + bool annotateParallelRepresentation(ParallelRegion &PR); + + /// The parallel region info for the module. + ParallelIRRegionInfo &PRI; + + /// Function analyses provider callbacks. + /// + ///{ + FuncResultProviderTy &AAProvider; + FuncResultProviderTy &DTProvider; + ///} +}; + +bool AttributeAnnotator::annotateParallelRepresentation(ParallelRegion &PR) { + switch (PR.getKind()) { + case ParallelRegion::PRK_KMPC_RT: { + // We know the parallel runtime library call does not throw an exception. 
+    assert(isa<CallInst>(PR.getStartPoint()) &&
+           "Expected runtime call for KMPC parallel region");
+    // BUG FIX: doesNotThrow() is a const query and had no effect here; the
+    // intent is to *set* the attribute. Only report a change if the
+    // attribute was not already present.
+    CallInst &RTCall = cast<CallInst>(PR.getStartPoint());
+    if (RTCall.doesNotThrow())
+      return false;
+    RTCall.setDoesNotThrow();
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+bool AttributeAnnotator::runOnModule(Module &M) {
+  if (!AnnotateAttributes)
+    return false;
+
+  // Visit every detected parallel region of every function in the module.
+  bool Changed = false;
+  for (auto &It : PRI) {
+    for (ParallelRegion *PR : It.second) {
+      Changed |= annotateParallelRepresentation(*PR);
+      Changed |= annotateArgumentAttributes(*PR);
+    }
+  }
+
+  return Changed;
+}
+
+bool AttributeAnnotator::annotateArgumentAttributes(ParallelRegion &PR) {
+  // Try to set no-alias, no-capture, etc. argument annotations if possible.
+
+  LLVM_DEBUG(dbgs() << "Try to annotate argument attributes to "
+                    << PR.getStartPoint() << "\n");
+
+  const ParallelIRCommunicationInfo &PRCI = PR.getCommunicationInfo();
+  if (!PRCI.hasAnnotatableCommunication()) {
+    LLVM_DEBUG(dbgs() << " - Parallel region kind " << PR.getKind()
+                      << " does not support communication attributes, skip!\n");
+    return false;
+  }
+
+  SmallVector<ParallelRegion *, 4> CommunicatingParallelRegions;
+  if (!PRCI.getAllCommunicatingParallelRegions(CommunicatingParallelRegions)) {
+    LLVM_DEBUG(
+        dbgs()
+        << " - Communication involves an unknown user, skip for now!\n");
+    return false;
+  }
+
+  // Unrolling and inlining might have duplicated the (indirect) call sites of
+  // a outlined parallel region. These situations are not supported yet as they
+  // would require us to intersect the information from all call sites.
+ if (CommunicatingParallelRegions.size() > 1) { + LLVM_DEBUG(dbgs() << " - Communication involves " + << CommunicatingParallelRegions.size() + << " parallel regions, skip for now!\n"); + return false; + } + assert(!CommunicatingParallelRegions.empty()); + assert(CommunicatingParallelRegions.front() == &PR); + + SmallVector CommunicatedValues; + PRCI.getCommunicatedValues(CommunicatedValues); + + Instruction &PRStartInst = PR.getStartPoint(); + DominatorTree &DT = DTProvider(*PRStartInst.getFunction()); + + auto *PIRBuilder = ParallelIRBuilder::Create(PRI, PR.getKind()); + + // Set of pointer parameters that might be alias free in the parallel region, + // thus no-alias arguments. + SmallPtrSet NoAliasCandidates; + + bool Changed = false; + unsigned NumCommunicatedValues = CommunicatedValues.size(); + for (unsigned Idx = 0; Idx < NumCommunicatedValues; Idx++) { + if (!CommunicatedValues[Idx]) + continue; + + if (isa(CommunicatedValues[Idx])) + continue; + + // For now we skip all non-pointer-type values. + if (!CommunicatedValues[Idx]->getType()->isPointerTy()) + continue; + + int NumAttributes = + sizeof(ParameterAttributes) / sizeof(ParameterAttributes[0]); + + // This is for bookkeeping purposes only. + int NumAttributeStatistics = sizeof(ParameterAttributeStatistics) / + sizeof(ParameterAttributeStatistics[0]); + assert(NumAttributes <= NumAttributeStatistics && + "Require at least as many attribute statistics as there are " + "attributes!\n"); + + // Propagate the known argument/parameter attributes. 
+ for (int i = 0; i < NumAttributes; i++) { + if (!PRCI.hasAttributeInParallelRegion(Idx, ParameterAttributes[i])) + continue; + + if (!PIRBuilder->addAttributeInSequentialRegion(PRCI, Idx, + ParameterAttributes[i])) + continue; + + LLVM_DEBUG({ + int NumAttributeNames = sizeof(ParameterAttributeNames) / + sizeof(ParameterAttributeNames[0]); + assert(NumAttributes == NumAttributeNames); + dbgs() << " - Argument " << Idx << " is tagged with " + << ParameterAttributeNames[i] << "\n"; + }); + + (*ParameterAttributeStatistics[i])++; + Changed = true; + } + + // After we propagated "local" parameter attributes we proceed to check + // if this argument could be marked as no-alias. If it already is marked, or + // if it might alias with anything, we skip it. To check for the latter, we + // first require the argument to be identified as function local and not + // captured up to the point of the parallel region. Later we also verify + // they do not alias with other arguments to the parallel region. + if (PRCI.hasAttributeInParallelRegion(Idx, Attribute::NoAlias)) + continue; + if (!isIdentifiedFunctionLocal(CommunicatedValues[Idx])) + continue; + + if (PointerMayBeCapturedBefore(CommunicatedValues[Idx], false, true, + &PRStartInst, &DT)) + continue; + + NoAliasCandidates.insert(CommunicatedValues[Idx]); + } + + if (NoAliasCandidates.empty()) { + LLVM_DEBUG(dbgs() << " - Parallel region has no no-alias candidates.\n"); + return Changed; + } + + SmallVector PotentialBarriers; + PR.getPotentialBarriers(PotentialBarriers); + + // This initial version does not support potential (and actual) barriers as + // the no-alias attributes would interfere with them. A way to combine + // no-alias attributes and potential/actual barriers is the use of operand + // bundles. 
+ if (!PotentialBarriers.empty()) { + LLVM_DEBUG(dbgs() << " - Parallel region contains " + << PotentialBarriers.size() + << " potential barriers, skip for now!\n"); + return Changed; + } + + // While all no-alias candidates do not alias with globals or pointers loaded + // from memory they might alias with other arguments. To check this we put + // them all in an alias set tracker and filter out singleton alias sets. + AliasAnalysis &AA = AAProvider(*PRStartInst.getFunction()); + AliasSetTracker AST(AA); + AAMDNodes AATags; + + for (Value *CommunicatedValue : CommunicatedValues) { + if (!CommunicatedValue) + continue; + AST.add(CommunicatedValue, MemoryLocation::UnknownSize, AATags); + } + + for (unsigned Idx = 0; Idx < NumCommunicatedValues; Idx++) { + if (!NoAliasCandidates.count(CommunicatedValues[Idx])) + continue; + + assert(CommunicatedValues[Idx]); + assert(CommunicatedValues[Idx]->getType()->isPointerTy()); + const auto &AliasSet = AST.getAliasSetForPointer( + CommunicatedValues[Idx], MemoryLocation::UnknownSize, AATags); + + // Check for singleton alias sets, thus pointers that do not alias. 
+ if (++AliasSet.begin() != AliasSet.end()) + continue; + assert(AliasSet.isMustAlias()); + + if (!PIRBuilder->addAttribute(PRCI, Idx, Attribute::NoAlias)) + continue; + + LLVM_DEBUG(dbgs() << " - Argument " << Idx << " is tagged as no-alias\n"); + + Changed = true; + NumNoAliasArguments++; + } + + return Changed; +} + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// Pass Manager integration code +// +//===----------------------------------------------------------------------===// +PreservedAnalyses +ParallelIRAttributeAnnotatorPass::run(Module &M, ModuleAnalysisManager &MAM) { + auto &FAM = MAM.getResult(M).getManager(); + FuncResultProviderTy AAProvider = + [&](Function &F) -> AliasAnalysis & { + return FAM.getResult(F); + }; + FuncResultProviderTy DTProvider = + [&](Function &F) -> DominatorTree & { + return FAM.getResult(F); + }; + + ParallelIRRegionInfo &PRI = MAM.getResult(M); + AttributeAnnotator PRM(PRI, AAProvider, DTProvider); + if (PRM.runOnModule(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { + +struct ParallelIRAttributeAnnotatorLegacyPass : public ModulePass { + static char ID; + + ParallelIRAttributeAnnotatorLegacyPass() : ModulePass(ID) { + initializeParallelIRAttributeAnnotatorLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + + FuncResultProviderTy DTProvider = + [&](Function &F) -> DominatorTree & { + return getAnalysis(F).getDomTree(); + }; + FuncResultProviderTy AAProvider = + [&](Function &F) -> AliasAnalysis & { + return getAnalysis(F).getAAResults(); + }; + + ParallelIRRegionInfo &PRI = + getAnalysis().getParallelIRRegionInfo(); + AttributeAnnotator RM(PRI, AAProvider, DTProvider); + + return RM.runOnModule(M); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + 
AU.addPreserved(); + AU.setPreservesCFG(); + } +}; + +} // end anonymous namespace + +char ParallelIRAttributeAnnotatorLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(ParallelIRAttributeAnnotatorLegacyPass, + "pir-attribute-annotator", + "Annotate attributes to parallel region", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ParallelIRRegionInfoPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(ParallelIRAttributeAnnotatorLegacyPass, + "pir-attribute-annotator", + "Annotate attributes to parallel region", false, false) + +ModulePass *llvm::createParallelIRAttributeAnnotatorLegacyPass() { + return new ParallelIRAttributeAnnotatorLegacyPass(); +} Index: lib/Transforms/ParallelIR/Builder.cpp =================================================================== --- /dev/null +++ lib/Transforms/ParallelIR/Builder.cpp @@ -0,0 +1,42 @@ +//===- ParallelIR/Builder.cpp - Parallel region IR builder ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the general (and abstract) parallel IR builder interface. +// The interface allows to manipulate parallel regions regardless of the +// underlying representation by representation specific implementations in the +// subclasses. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/ParallelIR/Builder.h" + +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "pir-builder" + +using namespace llvm; + +ParallelIRBuilder *createKMPCIRBuilder(ParallelIRRegionInfo &, + ParallelRegion::ParallelRegionKind PRKind); + +ParallelIRBuilder * +ParallelIRBuilder::Create(ParallelIRRegionInfo &PRI, + ParallelRegion::ParallelRegionKind PRKind) { + + switch (PRKind) { + case ParallelRegion::PRK_KMPC_RT: + case ParallelRegion::PRK_KMPC_TASK_RT: + case ParallelRegion::PRK_KMPC_FORK_RT: + return createKMPCIRBuilder(PRI, PRKind); + default: + break; + } + + llvm_unreachable("No builder for chosen parallel region kind available!"); +} Index: lib/Transforms/ParallelIR/CMakeLists.txt =================================================================== --- /dev/null +++ lib/Transforms/ParallelIR/CMakeLists.txt @@ -0,0 +1,13 @@ +add_llvm_library(LLVMParallelIROpts + ParallelIR.cpp + Builder.cpp + AttributeAnnotator.cpp + KMPCImpl.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/ParallelIR + + DEPENDS + intrinsics_gen + ) Index: lib/Transforms/ParallelIR/KMPCImpl.cpp =================================================================== --- /dev/null +++ lib/Transforms/ParallelIR/KMPCImpl.cpp @@ -0,0 +1,123 @@ +//===- KMPCImpl.cpp - Parallel IR Transformation Implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the parallel IR tranformation interface for parallel +// regions represented with KMPC (OpenMP) runtime calls. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/ParallelIR/Builder.h" + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ParallelIR/KMPCImpl.h" +#include "llvm/Analysis/ParallelIR/RegionInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "pir-builder" + +//===----------------------------------------------------------------------===// +// PIRBuilder specialization for the OpenMP KMPC runtime library. +// + +struct KMPC_ParallelIRBuilder : public ParallelIRBuilder { + + KMPC_ParallelIRBuilder(ParallelIRRegionInfo &PRI, + ParallelRegion::ParallelRegionKind PRKind) + : PRI(PRI), PRKind(PRKind) { + assert(PRKind == ParallelRegion::PRK_KMPC_FORK_RT || + PRKind == ParallelRegion::PRK_KMPC_TASK_RT); + } + + /// Return the offset at which the first subfunction call argument is located. + unsigned getFirstArgumentOffset() const; + + /// Return the offset at which the first subfunction parameter is located. + unsigned getFirstParameterOffset() const; + + /// See @ParallelIRBuilder::addAttributeInSequentialRegion + virtual bool + addAttributeInSequentialRegion(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, + Attribute::AttrKind Kind) const override; + + /// See @ParallelIRBuilder::addAttributeInParallelRegion + virtual bool + addAttributeInParallelRegion(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, + Attribute::AttrKind Kind) const override; + + /// See @ParallelIRBuilder::addAttribute + virtual bool addAttribute(const ParallelIRCommunicationInfo &PRCI, + unsigned Idx, + Attribute::AttrKind Kind) const override; + + /// The parallel region info pass. + ParallelIRRegionInfo &PRI; + + /// The actual parallel region info kind this builder was created for. 
+ /// There are multiple KMPC encodings (tasks/forks) that can be distinguished + /// this way. + ParallelRegion::ParallelRegionKind PRKind; +}; + +//===----------------------------------------------------------------------===// +// KMPC_ParallelIRBuilder implementation + +ParallelIRBuilder * +createKMPCIRBuilder(ParallelIRRegionInfo &PRI, + ParallelRegion::ParallelRegionKind PRKind) { + return new KMPC_ParallelIRBuilder(PRI, PRKind); +} + +unsigned KMPC_ParallelIRBuilder::getFirstParameterOffset() const { + assert(PRKind == ParallelRegion::PRK_KMPC_FORK_RT || + PRKind == ParallelRegion::PRK_KMPC_TASK_RT); + return PRKind == ParallelRegion::PRK_KMPC_FORK_RT ? 2 : 1; +} + +unsigned KMPC_ParallelIRBuilder::getFirstArgumentOffset() const { + assert(PRKind == ParallelRegion::PRK_KMPC_FORK_RT || + PRKind == ParallelRegion::PRK_KMPC_TASK_RT); + return PRKind == ParallelRegion::PRK_KMPC_FORK_RT ? 3 : 2; +} + +bool KMPC_ParallelIRBuilder::addAttributeInSequentialRegion( + const ParallelIRCommunicationInfo &PRCI, unsigned Idx, + Attribute::AttrKind Kind) const { + + const KMPC_CommunicationInfo &KMPCCI = + static_cast(PRCI); + CallInst &RTCall = KMPCCI.getRTCall(); + RTCall.addParamAttr(Idx + getFirstArgumentOffset(), Kind); + return true; +} + +bool KMPC_ParallelIRBuilder::addAttributeInParallelRegion( + const ParallelIRCommunicationInfo &PRCI, unsigned Idx, + Attribute::AttrKind Kind) const { + Argument *Arg = + cast(PRCI.getCommunicatedValueInParallelRegion(Idx)); + Arg->addAttr(Kind); + return true; +} + +bool KMPC_ParallelIRBuilder::addAttribute( + const ParallelIRCommunicationInfo &PRCI, unsigned Idx, + Attribute::AttrKind Kind) const { + addAttributeInSequentialRegion(PRCI, Idx, Kind); + addAttributeInParallelRegion(PRCI, Idx, Kind); + return true; +} Index: lib/Transforms/ParallelIR/LLVMBuild.txt =================================================================== --- lib/Transforms/ParallelIR/LLVMBuild.txt +++ lib/Transforms/ParallelIR/LLVMBuild.txt @@ -1,4 +1,4 @@ 
-;===- ./lib/Passes/LLVMBuild.txt -------------------------------*- Conf -*--===; +;===- ./lib/Transforms/ParallelIR/LLVMBuild.txt ----------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,6 +17,7 @@ [component_0] type = Library -name = Passes -parent = Libraries -required_libraries = AggressiveInstCombine Analysis CodeGen Core IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation +name = ParallelIR +parent = Transforms +library_name = ParallelIROpts +required_libraries = Analysis Core InstCombine Support TransformUtils Index: lib/Transforms/ParallelIR/ParallelIR.cpp =================================================================== --- /dev/null +++ lib/Transforms/ParallelIR/ParallelIR.cpp @@ -0,0 +1,22 @@ +//===-- ParallelIR.cpp ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/ParallelIR.h" + +#include "llvm/InitializePasses.h" + +using namespace llvm; + +/// initializeParallelIROptsPasses - Initialize all passes linked into the +/// ParallelIROpts library. 
+void llvm::initializeParallelIROpts(PassRegistry &Registry) { + initializeParallelIRAttributeAnnotatorLegacyPassPass(Registry); +} Index: test/Other/opt-O3-pipeline.ll =================================================================== --- test/Other/opt-O3-pipeline.ll +++ test/Other/opt-O3-pipeline.ll @@ -27,6 +27,9 @@ ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Force set function attributes ; CHECK-NEXT: Infer set function attributes +; CHECK-NEXT: Detect parallel regions +; CHECK-NEXT: Annotate attributes to parallel region +; CHECK-NEXT: Unnamed pass: implement Pass::getPassName() ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Call-site splitting ; CHECK-NEXT: Interprocedural Sparse Conditional Constant Propagation Index: test/Transforms/ParallelIR/kmpc_arg_attributes.ll =================================================================== --- /dev/null +++ test/Transforms/ParallelIR/kmpc_arg_attributes.ll @@ -0,0 +1,139 @@ +; RUN: opt -analyze -pir-regions %s | FileCheck %s --check-prefix=PIR_REGS +; RUN: opt -S -pir-attribute-annotator %s | FileCheck %s --check-prefix=PIR_ATTR + +; PIR_REGS: Parallel region in main [1]: +; PIR_REGS-NEXT: Parallel Region [5]: +; PIR_REGS-NEXT: fork call: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call( +; PIR_REGS-NEXT: sub-function: .omp_outlined. +; PIR_REGS-NEXT: communicated: %c = alloca [100 x float], align 16 : 3 +; PIR_REGS-NEXT: communicated: %a = alloca [100 x float], align 16 : 1 +; PIR_REGS-NEXT: communicated: %b = alloca [100 x float], align 16 : 1 + +; The PIR ATTR check lines below will verify that %a, %b, and %c are annotated +; with noalias, nocapture, and for the first two also readonly. %c should be +; writeonly but it has not the appropriate argument attribute. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%ident_t = type { i32, i32, i32, i32, i8* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.str.1 = private unnamed_addr constant [13 x i8] c"c[N/2] = %f\0A\00", align 1 + +define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 { +entry: + %a = alloca [100 x float], align 16 + %b = alloca [100 x float], align 16 + %c = alloca [100 x float], align 16 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %conv = sitofp i32 %i.0 to double + %mul = fmul double %conv, 1.000000e+00 + %conv1 = fptrunc double %mul to float + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds [100 x float], [100 x float]* %b, i64 0, i64 %idxprom + store float %conv1, float* %arrayidx, align 4 + %mul2 = fmul float 2.000000e+00, %conv1 + %idxprom3 = sext i32 %i.0 to i64 + %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %a, i64 0, i64 %idxprom3 + store float %mul2, float* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond +; PIR_ATTR: [100 x float]* noalias nocapture %c, [100 x float]* noalias nocapture readonly %a, [100 x float]* noalias nocapture readonly %b) + call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%ident_t* @1, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [100 x float]*, [100 x float]*, [100 x float]*)* @.omp_outlined. to void (i32*, i32*, ...)*), [100 x float]* %c, [100 x float]* %a, [100 x float]* %b) + %arrayidx5 = getelementptr inbounds [100 x float], [100 x float]* %c, i64 0, i64 50 + %0 = load float, float* %arrayidx5, align 8 + %conv6 = fpext float %0 to double + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), double %conv6) + ret i32 0 +} + +; PIR_ATTR: [100 x float]* noalias nocapture dereferenceable(400) %c, [100 x float]* noalias nocapture readonly dereferenceable(400) %a, [100 x float]* noalias nocapture readonly dereferenceable(400) %b) #0 { +define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., [100 x float]* nocapture dereferenceable(400) %c, [100 x float]* nocapture readonly dereferenceable(400) %a, [100 x float]* nocapture readonly dereferenceable(400) %b) #0 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + store i32 0, i32* %.omp.lb, align 4 + store i32 99, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32, i32* %.global_tid., align 4 + call void @__kmpc_for_static_init_4(%ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %1 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %1, 99 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 99, %cond.true ], [ %2, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %3 = load 
i32, i32* %.omp.lb, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %.omp.iv.0 = phi i32 [ %3, %cond.end ], [ %add7, %omp.inner.for.inc ] + %4 = load i32, i32* %.omp.ub, align 4 + %cmp1 = icmp sle i32 %.omp.iv.0, %4 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %mul = mul nsw i32 %.omp.iv.0, 1 + %add = add nsw i32 0, %mul + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds [100 x float], [100 x float]* %a, i64 0, i64 %idxprom + %5 = load float, float* %arrayidx, align 4 + %idxprom2 = sext i32 %add to i64 + %arrayidx3 = getelementptr inbounds [100 x float], [100 x float]* %b, i64 0, i64 %idxprom2 + %6 = load float, float* %arrayidx3, align 4 + %add4 = fadd float %5, %6 + %idxprom5 = sext i32 %add to i64 + %arrayidx6 = getelementptr inbounds [100 x float], [100 x float]* %c, i64 0, i64 %idxprom5 + store float %add4, float* %arrayidx6, align 4 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %add7 = add nsw i32 %.omp.iv.0, 1 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%ident_t* @0, i32 %0) + ret void +} + +declare void @__kmpc_for_static_init_4(%ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%ident_t*, i32) + +declare void @__kmpc_fork_call(%ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare i32 @printf(i8*, ...) 
#1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/ParallelIR/kmpc_arg_attributes2.ll =================================================================== --- /dev/null +++ test/Transforms/ParallelIR/kmpc_arg_attributes2.ll @@ -0,0 +1,243 @@ +; RUN: opt -analyze -pir-regions %s | FileCheck %s --check-prefix=PIR_REGS +; RUN: opt -S -pir-attribute-annotator %s | FileCheck %s --check-prefix=PIR_ATTR +; +; PIR_REGS: Parallel region in main [1]: +; PIR_REGS-NEXT: Parallel Region [5]: +; PIR_REGS-NEXT: fork call: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call( +; PIR_REGS-NEXT: sub-function: .omp_outlined. +; PIR_REGS-NEXT: communicated: %c = alloca [10 x float], align 16 : 3 +; PIR_REGS-NEXT: communicated: %a = alloca [10 x float], align 16 : 1 +; PIR_REGS-NEXT: communicated: %b = alloca [10 x float], align 16 : 3 + +; The PIR ATTR check lines below will verify that %a, %b, and %c are annotated +; with nocapture, and %a with readonly. %c should be writeonly but it has not +; the appropriate argument attribute. 
+; Note: noalias is missing due to the potential barriers. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%ident_t = type { i32, i32, i32, i32, i8* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.str.1 = private unnamed_addr constant [13 x i8] c"c[N/2] = %f\0A\00", align 1 + +define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 { +entry: + %a = alloca [10 x float], align 16 + %b = alloca [10 x float], align 16 + %c = alloca [10 x float], align 16 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %conv = sitofp i32 %i.0 to double + %mul = fmul double %conv, 1.000000e+00 + %conv1 = fptrunc double %mul to float + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds [10 x float], [10 x float]* %b, i64 0, i64 %idxprom + store float %conv1, float* %arrayidx, align 4 + %mul2 = fmul float 2.000000e+00, %conv1 + %idxprom3 = sext i32 %i.0 to i64 + %arrayidx4 = getelementptr inbounds [10 x float], [10 x float]* %a, i64 0, i64 %idxprom3 + store float %mul2, float* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + br label %for.cond5 + +for.cond5: ; preds = %for.inc11, %for.end + %i.1 = phi i32 [ 0, %for.end ], [ %inc12, %for.inc11 ] + %cmp6 = icmp 
slt i32 %i.1, 10 + br i1 %cmp6, label %for.body8, label %for.end13 + +for.body8: ; preds = %for.cond5 + %idxprom9 = sext i32 %i.1 to i64 + %arrayidx10 = getelementptr inbounds [10 x float], [10 x float]* %c, i64 0, i64 %idxprom9 + store float 0.000000e+00, float* %arrayidx10, align 4 + br label %for.inc11 + +for.inc11: ; preds = %for.body8 + %inc12 = add nsw i32 %i.1, 1 + br label %for.cond5 + +for.end13: ; preds = %for.cond5 + br label %for.cond14 + +for.cond14: ; preds = %for.inc18, %for.end13 + %i.2 = phi i32 [ 0, %for.end13 ], [ %inc19, %for.inc18 ] + %cmp15 = icmp slt i32 %i.2, 10 + br i1 %cmp15, label %for.body17, label %for.end20 + +for.body17: ; preds = %for.cond14 +; PIR_ATTR: [10 x float]* nocapture %c, [10 x float]* nocapture readonly %a, [10 x float]* nocapture %b) + call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [10 x float]*, [10 x float]*, [10 x float]*)* @.omp_outlined. to void (i32*, i32*, ...)*), [10 x float]* %c, [10 x float]* %a, [10 x float]* %b) + br label %for.inc18 + +for.inc18: ; preds = %for.body17 + %inc19 = add nsw i32 %i.2, 1 + br label %for.cond14 + +for.end20: ; preds = %for.cond14 + %arrayidx21 = getelementptr inbounds [10 x float], [10 x float]* %c, i64 0, i64 5 + %0 = load float, float* %arrayidx21, align 4 + %conv22 = fpext float %0 to double + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), double %conv22) + ret i32 0 +} + +; Note: noalias attributes are not placed due to the (potential) barriers inside the function! 
+; +; PIR_ATTR: [10 x float]* nocapture dereferenceable(40) %c, [10 x float]* nocapture readonly dereferenceable(40) %a, [10 x float]* nocapture dereferenceable(40) %b) #0 { +define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., [10 x float]* nocapture dereferenceable(40) %c, [10 x float]* nocapture readonly dereferenceable(40) %a, [10 x float]* nocapture dereferenceable(40) %b) #0 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %.omp.lb12 = alloca i32, align 4 + %.omp.ub13 = alloca i32, align 4 + %.omp.stride14 = alloca i32, align 4 + %.omp.is_last15 = alloca i32, align 4 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32, i32* %.global_tid., align 4 + call void @__kmpc_for_static_init_4(%ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %1 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %1, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %2, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %3 = load i32, i32* %.omp.lb, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %.omp.iv.0 = phi i32 [ %3, %cond.end ], [ %add9, %omp.inner.for.inc ] + %4 = load i32, i32* %.omp.ub, align 4 + %cmp2 = icmp sle i32 %.omp.iv.0, %4 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %mul = mul nsw i32 %.omp.iv.0, 1 + %add = add nsw i32 0, %mul + 
%idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds [10 x float], [10 x float]* %a, i64 0, i64 %idxprom + %5 = load float, float* %arrayidx, align 4 + %idxprom3 = sext i32 %add to i64 + %arrayidx4 = getelementptr inbounds [10 x float], [10 x float]* %c, i64 0, i64 %idxprom3 + %6 = load float, float* %arrayidx4, align 4 + %add5 = fadd float %5, %6 + %idxprom6 = sext i32 %add to i64 + %arrayidx7 = getelementptr inbounds [10 x float], [10 x float]* %c, i64 0, i64 %idxprom6 + %7 = load float, float* %arrayidx7, align 4 + %add8 = fadd float %7, %add5 + store float %add8, float* %arrayidx7, align 4 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %add9 = add nsw i32 %.omp.iv.0, 1 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%ident_t* @0, i32 %0) + call void @__kmpc_barrier(%ident_t* @1, i32 %0) + store i32 0, i32* %.omp.lb12, align 4 + store i32 9, i32* %.omp.ub13, align 4 + store i32 1, i32* %.omp.stride14, align 4 + store i32 0, i32* %.omp.is_last15, align 4 + call void @__kmpc_for_static_init_4(%ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last15, i32* %.omp.lb12, i32* %.omp.ub13, i32* %.omp.stride14, i32 1, i32 1) + %8 = load i32, i32* %.omp.ub13, align 4 + %cmp17 = icmp sgt i32 %8, 9 + br i1 %cmp17, label %cond.true18, label %cond.false19 + +cond.true18: ; preds = %omp.loop.exit + br label %cond.end20 + +cond.false19: ; preds = %omp.loop.exit + %9 = load i32, i32* %.omp.ub13, align 4 + br label %cond.end20 + +cond.end20: ; preds = %cond.false19, %cond.true18 + %cond21 = phi i32 [ 9, %cond.true18 ], [ %9, %cond.false19 ] + store i32 %cond21, i32* %.omp.ub13, align 4 + %10 = load i32, i32* %.omp.lb12, align 4 + br label %omp.inner.for.cond22 + +omp.inner.for.cond22: ; preds = 
%omp.inner.for.inc36, %cond.end20 + %.omp.iv10.0 = phi i32 [ %10, %cond.end20 ], [ %add37, %omp.inner.for.inc36 ] + %11 = load i32, i32* %.omp.ub13, align 4 + %cmp23 = icmp sle i32 %.omp.iv10.0, %11 + br i1 %cmp23, label %omp.inner.for.body24, label %omp.inner.for.end38 + +omp.inner.for.body24: ; preds = %omp.inner.for.cond22 + %mul25 = mul nsw i32 %.omp.iv10.0, 1 + %add26 = add nsw i32 0, %mul25 + %idxprom27 = sext i32 %add26 to i64 + %arrayidx28 = getelementptr inbounds [10 x float], [10 x float]* %a, i64 0, i64 %idxprom27 + %12 = load float, float* %arrayidx28, align 4 + %idxprom29 = sext i32 %add26 to i64 + %arrayidx30 = getelementptr inbounds [10 x float], [10 x float]* %b, i64 0, i64 %idxprom29 + %13 = load float, float* %arrayidx30, align 4 + %add31 = fadd float %12, %13 + %idxprom32 = sext i32 %add26 to i64 + %arrayidx33 = getelementptr inbounds [10 x float], [10 x float]* %b, i64 0, i64 %idxprom32 + %14 = load float, float* %arrayidx33, align 4 + %add34 = fadd float %14, %add31 + store float %add34, float* %arrayidx33, align 4 + br label %omp.body.continue35 + +omp.body.continue35: ; preds = %omp.inner.for.body24 + br label %omp.inner.for.inc36 + +omp.inner.for.inc36: ; preds = %omp.body.continue35 + %add37 = add nsw i32 %.omp.iv10.0, 1 + br label %omp.inner.for.cond22 + +omp.inner.for.end38: ; preds = %omp.inner.for.cond22 + br label %omp.loop.exit39 + +omp.loop.exit39: ; preds = %omp.inner.for.end38 + call void @__kmpc_for_static_fini(%ident_t* @0, i32 %0) + call void @__kmpc_barrier(%ident_t* @1, i32 %0) + ret void +} + +declare void @__kmpc_for_static_init_4(%ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%ident_t*, i32) + +declare void @__kmpc_barrier(%ident_t*, i32) + +declare void @__kmpc_fork_call(%ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare i32 @printf(i8*, ...) 
#1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/ParallelIR/kmpc_noalias_arg.ll =================================================================== --- /dev/null +++ test/Transforms/ParallelIR/kmpc_noalias_arg.ll @@ -0,0 +1,145 @@ +; RUN: opt -analyze -pir-regions %s | FileCheck %s --check-prefix=PIR_REGS +; RUN: opt -S -pir-attribute-annotator %s | FileCheck %s --check-prefix=PIR_ATTR + +; PIR_REGS: Parallel region in main [1]: +; PIR_REGS-NEXT: Parallel Region [5]: +; PIR_REGS-NEXT: fork call: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call( +; PIR_REGS-NEXT: sub-function: .omp_outlined. +; PIR_REGS-NEXT: communicated: %c = alloca [100 x float], align 16 : 3 +; PIR_REGS-NEXT: communicated: %a = alloca [100 x float], align 16 : 1 +; PIR_REGS-NEXT: communicated: %b = alloca [100 x float], align 16 : 1 + +; The PIR ATTR check lines below will verify that %c is annotated as noalias but +; not %a and %b as they can escape prior to the parallel region. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%ident_t = type { i32, i32, i32, i32, i8* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.str.1 = private unnamed_addr constant [13 x i8] c"c[N/2] = %f\0A\00", align 1 + +@Capture = common global float* null, align 8 +declare void @capture([100 x float] *) + +define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 { +entry: + %a = alloca [100 x float], align 16 + %b = alloca [100 x float], align 16 + %c = alloca [100 x float], align 16 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %acast = bitcast [100 x float]* %a to float* + store float* %acast, float ** @Capture + %conv = sitofp i32 %i.0 to double + %mul = fmul double %conv, 1.000000e+00 + %conv1 = fptrunc double %mul to float + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds [100 x float], [100 x float]* %b, i64 0, i64 %idxprom + store float %conv1, float* %arrayidx, align 4 + %mul2 = fmul float 2.000000e+00, %conv1 + %idxprom3 = sext i32 %i.0 to i64 + %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %a, i64 0, i64 %idxprom3 + store float %mul2, float* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + call void @capture([100 x float]* %b) +; PIR_ATTR: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%ident_t* @1, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [100 x float]*, [100 x float]*, [100 x float]*)* @.omp_outlined. to void (i32*, i32*, ...)*), [100 x float]* noalias nocapture %c, [100 x float]* nocapture readonly %a, [100 x float]* nocapture readonly %b) + call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%ident_t* @1, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [100 x float]*, [100 x float]*, [100 x float]*)* @.omp_outlined. to void (i32*, i32*, ...)*), [100 x float]* %c, [100 x float]* %a, [100 x float]* %b) + call void @capture([100 x float]* %c) + %arrayidx5 = getelementptr inbounds [100 x float], [100 x float]* %c, i64 0, i64 50 + %0 = load float, float* %arrayidx5, align 8 + %conv6 = fpext float %0 to double + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), double %conv6) + ret i32 0 +} + +; PIR_ATTR: [100 x float]* noalias nocapture dereferenceable(400) %c, [100 x float]* nocapture readonly dereferenceable(400) %a, [100 x float]* nocapture readonly dereferenceable(400) %b) #0 { +define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., [100 x float]* nocapture dereferenceable(400) %c, [100 x float]* nocapture readonly dereferenceable(400) %a, [100 x float]* nocapture readonly dereferenceable(400) %b) #0 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + store i32 0, i32* %.omp.lb, align 4 + store i32 99, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32, i32* %.global_tid., align 4 + call void @__kmpc_for_static_init_4(%ident_t* @0, i32 %0, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %1 = load i32, i32* %.omp.ub, align 4 + 
%cmp = icmp sgt i32 %1, 99 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 99, %cond.true ], [ %2, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %3 = load i32, i32* %.omp.lb, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %.omp.iv.0 = phi i32 [ %3, %cond.end ], [ %add7, %omp.inner.for.inc ] + %4 = load i32, i32* %.omp.ub, align 4 + %cmp1 = icmp sle i32 %.omp.iv.0, %4 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %mul = mul nsw i32 %.omp.iv.0, 1 + %add = add nsw i32 0, %mul + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds [100 x float], [100 x float]* %a, i64 0, i64 %idxprom + %5 = load float, float* %arrayidx, align 4 + %idxprom2 = sext i32 %add to i64 + %arrayidx3 = getelementptr inbounds [100 x float], [100 x float]* %b, i64 0, i64 %idxprom2 + %6 = load float, float* %arrayidx3, align 4 + %add4 = fadd float %5, %6 + %idxprom5 = sext i32 %add to i64 + %arrayidx6 = getelementptr inbounds [100 x float], [100 x float]* %c, i64 0, i64 %idxprom5 + store float %add4, float* %arrayidx6, align 4 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %add7 = add nsw i32 %.omp.iv.0, 1 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%ident_t* @0, i32 %0) + ret void +} + +declare void @__kmpc_for_static_init_4(%ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%ident_t*, i32) + +declare void 
@__kmpc_fork_call(%ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare i32 @printf(i8*, ...) #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: tools/bugpoint/CMakeLists.txt =================================================================== --- tools/bugpoint/CMakeLists.txt +++ tools/bugpoint/CMakeLists.txt @@ -11,6 +11,7 @@ Instrumentation Linker ObjCARCOpts + ParallelIROpts ScalarOpts Support Target Index: tools/bugpoint/LLVMBuild.txt =================================================================== --- tools/bugpoint/LLVMBuild.txt +++ tools/bugpoint/LLVMBuild.txt @@ -29,5 +29,6 @@ Instrumentation Linker ObjCARC + ParallelIR Scalar all-targets Index: tools/opt/CMakeLists.txt =================================================================== --- tools/opt/CMakeLists.txt +++ tools/opt/CMakeLists.txt @@ -12,6 +12,7 @@ Instrumentation MC ObjCARCOpts + ParallelIROpts ScalarOpts Support Target Index: tools/opt/LLVMBuild.txt =================================================================== --- tools/opt/LLVMBuild.txt +++ tools/opt/LLVMBuild.txt @@ 
-27,6 +27,7 @@ IRReader IPO Instrumentation + ParallelIR Scalar ObjCARC Passes Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -418,6 +418,7 @@ PassRegistry &Registry = *PassRegistry::getPassRegistry(); initializeCore(Registry); initializeCoroutines(Registry); + initializeParallelIROpts(Registry); initializeScalarOpts(Registry); initializeObjCARCOpts(Registry); initializeVectorization(Registry);