Index: cmake/modules/AddSphinxTarget.cmake =================================================================== --- cmake/modules/AddSphinxTarget.cmake +++ cmake/modules/AddSphinxTarget.cmake @@ -39,16 +39,18 @@ add_dependencies(sphinx ${SPHINX_TARGET_NAME}) # Handle installation - if (builder STREQUAL man) - # FIXME: We might not ship all the tools that these man pages describe - install(DIRECTORY "${SPHINX_BUILD_DIR}/" # Slash indicates contents of - DESTINATION share/man/man1) + if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) + if (builder STREQUAL man) + # FIXME: We might not ship all the tools that these man pages describe + install(DIRECTORY "${SPHINX_BUILD_DIR}/" # Slash indicates contents of + DESTINATION share/man/man1) - elseif (builder STREQUAL html) - install(DIRECTORY "${SPHINX_BUILD_DIR}" - DESTINATION "share/doc/${project}") - else() - message(WARNING Installation of ${builder} not supported) + elseif (builder STREQUAL html) + install(DIRECTORY "${SPHINX_BUILD_DIR}" + DESTINATION "share/doc/${project}") + else() + message(WARNING Installation of ${builder} not supported) + endif() endif() endif() endfunction() Index: include/llvm/ADT/PointerUnion.h =================================================================== --- include/llvm/ADT/PointerUnion.h +++ include/llvm/ADT/PointerUnion.h @@ -154,6 +154,12 @@ "Can't get the address because PointerLikeTypeTraits changes the ptr"); return (PT1 *)Val.getAddrOfPointer(); } + + /// \brief Assignment from nullptr which just clears the union. + const PointerUnion &operator=(std::nullptr_t) { + Val.initWithPointer(nullptr); + return *this; + } /// Assignment operators - Allow assigning into this union from either /// pointer type, setting the discriminator to remember what it came from. @@ -298,6 +304,12 @@ if (is()) return get(); return T(); } + + /// \brief Assignment from nullptr which just clears the union. 
+ const PointerUnion3 &operator=(std::nullptr_t) { + Val = nullptr; + return *this; + } /// Assignment operators - Allow assigning into this union from either /// pointer type, setting the discriminator to remember what it came from. @@ -407,6 +419,12 @@ if (is()) return get(); return T(); } + + /// \brief Assignment from nullptr which just clears the union. + const PointerUnion4 &operator=(std::nullptr_t) { + Val = nullptr; + return *this; + } /// Assignment operators - Allow assigning into this union from either /// pointer type, setting the discriminator to remember what it came from. Index: include/llvm/ADT/iterator.h =================================================================== --- include/llvm/ADT/iterator.h +++ include/llvm/ADT/iterator.h @@ -97,14 +97,19 @@ /// This class can be used through CRTP to adapt one iterator into another. /// Typically this is done through providing in the derived class a custom \c /// operator* implementation. Other methods can be overridden as well. -template > +template < + typename DerivedT, typename WrappedIteratorT, + typename IteratorCategoryT = + typename std::iterator_traits::iterator_category, + typename T = typename std::iterator_traits::value_type, + typename DifferenceTypeT = + typename std::iterator_traits::difference_type, + typename PointerT = T *, typename ReferenceT = T &, + // Don't provide these, they are mostly to act as aliases below. 
+ typename WrappedTraitsT = std::iterator_traits> class iterator_adaptor_base - : public iterator_facade_base< - DerivedT, typename WrappedTraitsT::iterator_category, T, - typename WrappedTraitsT::difference_type, PointerT, ReferenceT> { + : public iterator_facade_base { typedef typename iterator_adaptor_base::iterator_facade_base BaseT; protected: @@ -123,7 +128,7 @@ : I(std::forward(u)) {} public: - typedef typename WrappedTraitsT::difference_type difference_type; + typedef DifferenceTypeT difference_type; DerivedT &operator+=(difference_type n) { I += n; @@ -168,8 +173,10 @@ typename T = typename std::remove_reference< decltype(**std::declval())>::type> struct pointee_iterator - : iterator_adaptor_base, - WrappedIteratorT, T> { + : iterator_adaptor_base< + pointee_iterator, WrappedIteratorT, + typename std::iterator_traits::iterator_category, + T> { pointee_iterator() {} template pointee_iterator(U &&u) Index: include/llvm/Analysis/BlockFrequencyInfoImpl.h =================================================================== --- include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// // // Shared implementation of BlockFrequency for IR and Machine Instructions. +// See the documentation below for BlockFrequencyInfoImpl for details. // //===----------------------------------------------------------------------===// @@ -16,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/BasicBlock.h" #include "llvm/Support/BlockFrequency.h" @@ -896,6 +898,13 @@ class MachineLoop; class MachineLoopInfo; +namespace bfi_detail { +struct IrreducibleGraph; + +// This is part of a workaround for a GCC 4.7 crash on lambdas. 
+template struct BlockEdgesAdder; +} + /// \brief Base class for BlockFrequencyInfoImpl /// /// BlockFrequencyInfoImplBase has supporting data structures and some @@ -948,6 +957,7 @@ typedef SmallVector NodeList; LoopData *Parent; ///< The parent loop. bool IsPackaged; ///< Whether this has been packaged. + uint32_t NumHeaders; ///< Number of headers. ExitMap Exits; ///< Successor edges (and weights). NodeList Nodes; ///< Header and the members of the loop. BlockMass BackedgeMass; ///< Mass returned to loop header. @@ -955,11 +965,26 @@ Float Scale; LoopData(LoopData *Parent, const BlockNode &Header) - : Parent(Parent), IsPackaged(false), Nodes(1, Header) {} - bool isHeader(const BlockNode &Node) const { return Node == Nodes[0]; } + : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {} + template + LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther, + It2 LastOther) + : Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) { + NumHeaders = Nodes.size(); + Nodes.insert(Nodes.end(), FirstOther, LastOther); + } + bool isHeader(const BlockNode &Node) const { + if (isIrreducible()) + return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders, + Node); + return Node == Nodes[0]; + } BlockNode getHeader() const { return Nodes[0]; } + bool isIrreducible() const { return NumHeaders > 1; } - NodeList::const_iterator members_begin() const { return Nodes.begin() + 1; } + NodeList::const_iterator members_begin() const { + return Nodes.begin() + NumHeaders; + } NodeList::const_iterator members_end() const { return Nodes.end(); } iterator_range members() const { return make_range(members_begin(), members_end()); @@ -975,9 +1000,17 @@ WorkingData(const BlockNode &Node) : Node(Node), Loop(nullptr) {} bool isLoopHeader() const { return Loop && Loop->isHeader(Node); } + bool isDoubleLoopHeader() const { + return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() && + Loop->Parent->isHeader(Node); + } LoopData 
*getContainingLoop() const { - return isLoopHeader() ? Loop->Parent : Loop; + if (!isLoopHeader()) + return Loop; + if (!isDoubleLoopHeader()) + return Loop->Parent; + return Loop->Parent->Parent; } /// \brief Resolve a node to its representative. @@ -1011,12 +1044,22 @@ /// Get appropriate mass for Node. If Node is a loop-header (whose loop /// has been packaged), returns the mass of its pseudo-node. If it's a /// node inside a packaged loop, it returns the loop's mass. - BlockMass &getMass() { return isAPackage() ? Loop->Mass : Mass; } + BlockMass &getMass() { + if (!isAPackage()) + return Mass; + if (!isADoublePackage()) + return Loop->Mass; + return Loop->Parent->Mass; + } /// \brief Has ContainingLoop been packaged up? bool isPackaged() const { return getResolvedNode() != Node; } /// \brief Has Loop been packaged up? bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; } + /// \brief Has Loop been packaged up twice? + bool isADoublePackage() const { + return isDoubleLoopHeader() && Loop->Parent->IsPackaged; + } }; /// \brief Unscaled probability weight. @@ -1093,7 +1136,9 @@ /// /// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each /// successor edge. - void addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop, + /// + /// \return \c true unless there's an irreducible backedge. + bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist); /// \brief Add an edge to the distribution. @@ -1101,7 +1146,9 @@ /// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the /// edge is local/exit/backedge is in the context of LoopHead. Otherwise, /// every edge should be a local edge (since all the loops are packaged up). - void addToDist(Distribution &Dist, const LoopData *OuterLoop, + /// + /// \return \c true unless aborted due to an irreducible backedge. 
+ bool addToDist(Distribution &Dist, const LoopData *OuterLoop, const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight); LoopData &getLoopPackage(const BlockNode &Head) { @@ -1110,6 +1157,25 @@ return *Working[Head.Index].Loop; } + /// \brief Analyze irreducible SCCs. + /// + /// Separate irreducible SCCs from \c G, which is an explict graph of \c + /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr). + /// Insert them into \a Loops before \c Insert. + /// + /// \return the \c LoopData nodes representing the irreducible SCCs. + iterator_range::iterator> + analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop, + std::list::iterator Insert); + + /// \brief Update a loop after packaging irreducible SCCs inside of it. + /// + /// Update \c OuterLoop. Before finding irreducible control flow, it was + /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a + /// LoopData::BackedgeMass need to be reset. Also, nodes that were packaged + /// up need to be removed from \a OuterLoop::Nodes. + void updateLoopWithIrreducible(LoopData &OuterLoop); + /// \brief Distribute mass according to a distribution. /// /// Distributes the mass in Source according to Dist. If LoopHead.isValid(), @@ -1138,6 +1204,7 @@ void clear(); virtual std::string getBlockName(const BlockNode &Node) const; + std::string getLoopName(const LoopData &Loop) const; virtual raw_ostream &print(raw_ostream &OS) const { return OS; } void dump() const { print(dbgs()); } @@ -1197,6 +1264,106 @@ assert(BB && "Unexpected nullptr"); return BB->getName().str(); } + +/// \brief Graph of irreducible control flow. +/// +/// This graph is used for determining the SCCs in a loop (or top-level +/// function) that has irreducible control flow. 
+/// +/// During the block frequency algorithm, the local graphs are defined in a +/// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock +/// graphs for most edges, but getting others from \a LoopData::ExitMap. The +/// latter only has successor information. +/// +/// \a IrreducibleGraph makes this graph explicit. It's in a form that can use +/// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator), +/// and it explicitly lists predecessors and successors. The initialization +/// that relies on \c MachineBasicBlock is defined in the header. +struct IrreducibleGraph { + typedef BlockFrequencyInfoImplBase BFIBase; + + BFIBase &BFI; + + typedef BFIBase::BlockNode BlockNode; + struct IrrNode { + BlockNode Node; + unsigned NumIn; + std::deque Edges; + IrrNode(const BlockNode &Node) : Node(Node), NumIn(0) {} + + typedef std::deque::const_iterator iterator; + iterator pred_begin() const { return Edges.begin(); } + iterator succ_begin() const { return Edges.begin() + NumIn; } + iterator pred_end() const { return succ_begin(); } + iterator succ_end() const { return Edges.end(); } + }; + BlockNode Start; + const IrrNode *StartIrr; + std::vector Nodes; + SmallDenseMap Lookup; + + /// \brief Construct an explicit graph containing irreducible control flow. + /// + /// Construct an explicit graph of the control flow in \c OuterLoop (or the + /// top-level function, if \c OuterLoop is \c nullptr). Uses \c + /// addBlockEdges to add block successors that have not been packaged into + /// loops. + /// + /// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only expected + /// user of this. 
+ template + IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop, + BlockEdgesAdder addBlockEdges) + : BFI(BFI), StartIrr(nullptr) { + initialize(OuterLoop, addBlockEdges); + } + + template + void initialize(const BFIBase::LoopData *OuterLoop, + BlockEdgesAdder addBlockEdges); + void addNodesInLoop(const BFIBase::LoopData &OuterLoop); + void addNodesInFunction(); + void addNode(const BlockNode &Node) { + Nodes.emplace_back(Node); + BFI.Working[Node.Index].getMass() = BlockMass::getEmpty(); + } + void indexNodes(); + template + void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop, + BlockEdgesAdder addBlockEdges); + void addEdge(IrrNode &Irr, const BlockNode &Succ, + const BFIBase::LoopData *OuterLoop); +}; +template +void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop, + BlockEdgesAdder addBlockEdges) { + if (OuterLoop) { + addNodesInLoop(*OuterLoop); + for (auto N : OuterLoop->Nodes) + addEdges(N, OuterLoop, addBlockEdges); + } else { + addNodesInFunction(); + for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) + addEdges(Index, OuterLoop, addBlockEdges); + } + StartIrr = Lookup[Start.Index]; +} +template +void IrreducibleGraph::addEdges(const BlockNode &Node, + const BFIBase::LoopData *OuterLoop, + BlockEdgesAdder addBlockEdges) { + auto L = Lookup.find(Node.Index); + if (L == Lookup.end()) + return; + IrrNode &Irr = *L->second; + const auto &Working = BFI.Working[Node.Index]; + + if (Working.isAPackage()) + for (const auto &I : Working.Loop->Exits) + addEdge(Irr, I.first, OuterLoop); + else + addBlockEdges(*this, Irr, OuterLoop); +} } /// \brief Shared implementation for block frequency analysis. @@ -1205,6 +1372,22 @@ /// MachineBlockFrequencyInfo, and calculates the relative frequencies of /// blocks. /// +/// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block, +/// which is called the header. 
A given loop, L, can have sub-loops, which are +/// loops within the subgraph of L that exclude its header. (A "trivial" SCC +/// consists of a single block that does not have a self-edge.) +/// +/// In addition to loops, this algorithm has limited support for irreducible +/// SCCs, which are SCCs with multiple entry blocks. Irreducible SCCs are +/// discovered on they fly, and modelled as loops with multiple headers. +/// +/// The headers of irreducible sub-SCCs consist of its entry blocks and all +/// nodes that are targets of a backedge within it (excluding backedges within +/// true sub-loops). Block frequency calculations act as if a block is +/// inserted that intercepts all the edges to the headers. All backedges and +/// entries point to this block. Its successors are the headers, which split +/// the frequency evenly. +/// /// This algorithm leverages BlockMass and UnsignedFloat to maintain precision, /// separates mass distribution from loop scaling, and dithers to eliminate /// probability mass loss. @@ -1228,7 +1411,7 @@ /// All other stages make use of this ordering. Save a lookup from BlockT /// to BlockNode (the index into RPOT) in Nodes. /// -/// 1. Loop indexing (\a initializeLoops()). +/// 1. Loop initialization (\a initializeLoops()). /// /// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of /// the algorithm. In particular, store the immediate members of each loop @@ -1239,11 +1422,9 @@ /// For each loop (bottom-up), distribute mass through the DAG resulting /// from ignoring backedges and treating sub-loops as a single pseudo-node. /// Track the backedge mass distributed to the loop header, and use it to -/// calculate the loop scale (number of loop iterations). -/// -/// Visiting loops bottom-up is a post-order traversal of loop headers. -/// For each loop, immediate members that represent sub-loops will already -/// have been visited and packaged into a pseudo-node. 
+/// calculate the loop scale (number of loop iterations). Immediate +/// members that represent sub-loops will already have been visited and +/// packaged into a pseudo-node. /// /// Distributing mass in a loop is a reverse-post-order traversal through /// the loop. Start by assigning full mass to the Loop header. For each @@ -1260,6 +1441,11 @@ /// The weight, the successor, and its category are stored in \a /// Distribution. There can be multiple edges to each successor. /// +/// - If there's a backedge to a non-header, there's an irreducible SCC. +/// The usual flow is temporarily aborted. \a +/// computeIrreducibleMass() finds the irreducible SCCs within the +/// loop, packages them up, and restarts the flow. +/// /// - Normalize the distribution: scale weights down so that their sum /// is 32-bits, and coalesce multiple edges to the same node. /// @@ -1274,39 +1460,62 @@ /// loops in the function. This uses the same algorithm as distributing /// mass in a loop, except that there are no exit or backedge edges. /// -/// 4. Loop unpackaging and cleanup (\a finalizeMetrics()). +/// 4. Unpackage loops (\a unwrapLoops()). +/// +/// Initialize each block's frequency to a floating point representation of +/// its mass. /// -/// Initialize the frequency to a floating point representation of its -/// mass. +/// Visit loops top-down, scaling the frequencies of its immediate members +/// by the loop's pseudo-node's frequency. /// -/// Visit loops top-down (reverse post-order), scaling the loop header's -/// frequency by its psuedo-node's mass and loop scale. Keep track of the -/// minimum and maximum final frequencies. +/// 5. Convert frequencies to a 64-bit range (\a finalizeMetrics()). /// /// Using the min and max frequencies as a guide, translate floating point /// frequencies to an appropriate range in uint64_t. /// /// It has some known flaws. /// -/// - Irreducible control flow isn't modelled correctly. 
In particular, -/// LoopInfo and MachineLoopInfo ignore irreducible backedges. The main -/// result is that irreducible SCCs will under-scaled. No mass is lost, -/// but the computed branch weights for the loop pseudo-node will be -/// incorrect. +/// - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting +/// BlockFrequency's 64-bit integer precision. +/// +/// - The model of irreducible control flow is a rough approximation. /// /// Modelling irreducible control flow exactly involves setting up and /// solving a group of infinite geometric series. Such precision is /// unlikely to be worthwhile, since most of our algorithms give up on /// irreducible control flow anyway. /// -/// Nevertheless, we might find that we need to get closer. If -/// LoopInfo/MachineLoopInfo flags loops with irreducible control flow -/// (and/or the function as a whole), we can find the SCCs, compute an -/// approximate exit frequency for the SCC as a whole, and scale up -/// accordingly. +/// Nevertheless, we might find that we need to get closer. Here's a sort +/// of TODO list for the model with diminishing returns, to be completed as +/// necessary. /// -/// - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting -/// BlockFrequency's 64-bit integer precision. +/// - The headers for the \a LoopData representing an irreducible SCC +/// include non-entry blocks. When these extra blocks exist, they +/// indicate a self-contained irreducible sub-SCC. We could treat them +/// as sub-loops, rather than arbitrarily shoving the problematic +/// blocks into the headers of the main irreducible SCC. +/// +/// - Backedge frequencies are assumed to be evenly split between the +/// headers of a given irreducible SCC. Instead, we could track the +/// backedge mass separately for each header, and adjust their relative +/// frequencies. 
+/// +/// - Entry frequencies are assumed to be evenly split between the +/// headers of a given irreducible SCC, which is the only option if we +/// need to compute mass in the SCC before its parent loop. Instead, +/// we could partially compute mass in the parent loop, and stop when +/// we get to the SCC. Here, we have the correct ratio of entry +/// masses, which we can use to adjust their relative frequencies. +/// Compute mass in the SCC, and then continue propagation in the +/// parent. +/// +/// - We can propagate mass iteratively through the SCC, for some fixed +/// number of iterations. Each iteration starts by assigning the entry +/// blocks their backedge mass from the prior iteration. The final +/// mass for each block (and each exit, and the total backedge mass +/// used for computing loop scale) is the sum of all iterations. +/// (Running this until fixed point would "solve" the geometric +/// series by simulation.) template class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase { typedef typename bfi_detail::TypeMap::BlockT BlockT; typedef typename bfi_detail::TypeMap::FunctionT FunctionT; @@ -1315,6 +1524,9 @@ typedef typename bfi_detail::TypeMap::LoopT LoopT; typedef typename bfi_detail::TypeMap::LoopInfoT LoopInfoT; + // This is part of a workaround for a GCC 4.7 crash on lambdas. + friend struct bfi_detail::BlockEdgesAdder; + typedef GraphTraits Successor; typedef GraphTraits> Predecessor; @@ -1361,7 +1573,9 @@ /// /// In the context of distributing mass through \c OuterLoop, divide the mass /// currently assigned to \c Node between its successors. - void propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node); + /// + /// \return \c true unless there's an irreducible backedge. + bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node); /// \brief Compute mass in a particular loop. /// @@ -1370,20 +1584,51 @@ /// that have not been packaged into sub-loops. 
/// /// \pre \a computeMassInLoop() has been called for each subloop of \c Loop. - void computeMassInLoop(LoopData &Loop); + /// \return \c true unless there's an irreducible backedge. + bool computeMassInLoop(LoopData &Loop); + + /// \brief Try to compute mass in the top-level function. + /// + /// Assign mass to the entry block, and then for each block in reverse + /// post-order, distribute mass to its successors. Skips nodes that have + /// been packaged into loops. + /// + /// \pre \a computeMassInLoops() has been called. + /// \return \c true unless there's an irreducible backedge. + bool tryToComputeMassInFunction(); + + /// \brief Compute mass in (and package up) irreducible SCCs. + /// + /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front + /// of \c Insert), and call \a computeMassInLoop() on each of them. + /// + /// If \c OuterLoop is \c nullptr, it refers to the top-level function. + /// + /// \pre \a computeMassInLoop() has been called for each subloop of \c + /// OuterLoop. + /// \pre \c Insert points at the the last loop successfully processed by \a + /// computeMassInLoop(). + /// \pre \c OuterLoop has irreducible SCCs. + void computeIrreducibleMass(LoopData *OuterLoop, + std::list::iterator Insert); /// \brief Compute mass in all loops. /// /// For each loop bottom-up, call \a computeMassInLoop(). + /// + /// \a computeMassInLoop() aborts (and returns \c false) on loops that + /// contain a irreducible sub-SCCs. Use \a computeIrreducibleMass() and then + /// re-enter \a computeMassInLoop(). + /// + /// \post \a computeMassInLoop() has returned \c true for every loop. void computeMassInLoops(); /// \brief Compute mass in the top-level function. /// - /// Assign mass to the entry block, and then for each block in reverse - /// post-order, distribute mass to its successors. Skips nodes that have - /// been packaged into loops. 
+ /// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to + /// compute mass in the top-level function. /// - /// \pre \a computeMassInLoops() has been called. + /// \post \a tryToComputeMassInFunction() has returned \c true. void computeMassInFunction(); std::string getBlockName(const BlockNode &Node) const override { @@ -1530,27 +1775,50 @@ template void BlockFrequencyInfoImpl::computeMassInLoops() { // Visit loops with the deepest first, and the top-level loops last. - for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) - computeMassInLoop(*L); + for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) { + if (computeMassInLoop(*L)) + continue; + auto Next = std::next(L); + computeIrreducibleMass(&*L, L.base()); + L = std::prev(Next); + if (computeMassInLoop(*L)) + continue; + llvm_unreachable("unhandled irreducible control flow"); + } } template -void BlockFrequencyInfoImpl::computeMassInLoop(LoopData &Loop) { +bool BlockFrequencyInfoImpl::computeMassInLoop(LoopData &Loop) { // Compute mass in loop. 
- DEBUG(dbgs() << "compute-mass-in-loop: " << getBlockName(Loop.getHeader()) - << "\n"); - - Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); - propagateMassToSuccessors(&Loop, Loop.getHeader()); - - for (const BlockNode &M : Loop.members()) - propagateMassToSuccessors(&Loop, M); + DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n"); + + if (Loop.isIrreducible()) { + BlockMass Remaining = BlockMass::getFull(); + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + auto &Mass = Working[Loop.Nodes[H].Index].getMass(); + Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H); + Remaining -= Mass; + } + for (const BlockNode &M : Loop.Nodes) + if (!propagateMassToSuccessors(&Loop, M)) + llvm_unreachable("unhandled irreducible control flow"); + } else { + Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); + if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) + llvm_unreachable("irreducible control flow to loop header!?"); + for (const BlockNode &M : Loop.members()) + if (!propagateMassToSuccessors(&Loop, M)) + // Irreducible backedge. + return false; + } computeLoopScale(Loop); packageLoop(Loop); + return true; } -template void BlockFrequencyInfoImpl::computeMassInFunction() { +template +bool BlockFrequencyInfoImpl::tryToComputeMassInFunction() { // Compute mass in function. DEBUG(dbgs() << "compute-mass-in-function\n"); assert(!Working.empty() && "no blocks in function"); @@ -1563,12 +1831,63 @@ if (Working[Node.Index].isPackaged()) continue; - propagateMassToSuccessors(nullptr, Node); + if (!propagateMassToSuccessors(nullptr, Node)) + return false; + } + return true; +} + +template void BlockFrequencyInfoImpl::computeMassInFunction() { + if (tryToComputeMassInFunction()) + return; + computeIrreducibleMass(nullptr, Loops.begin()); + if (tryToComputeMassInFunction()) + return; + llvm_unreachable("unhandled irreducible control flow"); +} + +/// \note This should be a lambda, but that crashes GCC 4.7. 
+namespace bfi_detail { +template struct BlockEdgesAdder { + typedef BT BlockT; + typedef BlockFrequencyInfoImplBase::LoopData LoopData; + typedef GraphTraits Successor; + + const BlockFrequencyInfoImpl &BFI; + explicit BlockEdgesAdder(const BlockFrequencyInfoImpl &BFI) + : BFI(BFI) {} + void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr, + const LoopData *OuterLoop) { + const BlockT *BB = BFI.RPOT[Irr.Node.Index]; + for (auto I = Successor::child_begin(BB), E = Successor::child_end(BB); + I != E; ++I) + G.addEdge(Irr, BFI.getNode(*I), OuterLoop); } +}; +} +template +void BlockFrequencyInfoImpl::computeIrreducibleMass( + LoopData *OuterLoop, std::list::iterator Insert) { + DEBUG(dbgs() << "analyze-irreducible-in-"; + if (OuterLoop) dbgs() << "loop: " << getLoopName(*OuterLoop) << "\n"; + else dbgs() << "function\n"); + + using namespace bfi_detail; + // Ideally, addBlockEdges() would be declared here as a lambda, but that + // crashes GCC 4.7. + BlockEdgesAdder addBlockEdges(*this); + IrreducibleGraph G(*this, OuterLoop, addBlockEdges); + + for (auto &L : analyzeIrreducible(G, OuterLoop, Insert)) + computeMassInLoop(L); + + if (!OuterLoop) + return; + updateLoopWithIrreducible(*OuterLoop); } template -void +bool BlockFrequencyInfoImpl::propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node) { DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n"); @@ -1576,20 +1895,25 @@ Distribution Dist; if (auto *Loop = Working[Node.Index].getPackagedLoop()) { assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop"); - addLoopSuccessorsToDist(OuterLoop, *Loop, Dist); + if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist)) + // Irreducible backedge. + return false; } else { const BlockT *BB = getBlock(Node); for (auto SI = Successor::child_begin(BB), SE = Successor::child_end(BB); SI != SE; ++SI) // Do not dereference SI, or getEdgeWeight() is linear in the number of // successors. 
- addToDist(Dist, OuterLoop, Node, getNode(*SI), - BPI->getEdgeWeight(BB, SI)); + if (!addToDist(Dist, OuterLoop, Node, getNode(*SI), + BPI->getEdgeWeight(BB, SI))) + // Irreducible backedge. + return false; } // Distribute mass to successors, saving exit and backedge data in the // loop header. distributeMass(Node, OuterLoop, Dist); + return true; } template Index: include/llvm/Analysis/LazyCallGraph.h =================================================================== --- include/llvm/Analysis/LazyCallGraph.h +++ include/llvm/Analysis/LazyCallGraph.h @@ -113,8 +113,9 @@ /// be scanned for "calls" or uses of functions and its child information /// will be constructed. All of these results are accumulated and cached in /// the graph. - class iterator : public iterator_adaptor_base< - iterator, NodeVectorImplT::iterator, Node> { + class iterator + : public iterator_adaptor_base { friend class LazyCallGraph; friend class LazyCallGraph::Node; Index: include/llvm/IR/User.h =================================================================== --- include/llvm/IR/User.h +++ include/llvm/IR/User.h @@ -131,8 +131,9 @@ /// Convenience iterator for directly iterating over the Values in the /// OperandList struct value_op_iterator - : iterator_adaptor_base { + : iterator_adaptor_base { explicit value_op_iterator(Use *U = nullptr) : iterator_adaptor_base(U) {} Value *operator*() const { return *I; } Index: include/llvm/MC/MCExpr.h =================================================================== --- include/llvm/MC/MCExpr.h +++ include/llvm/MC/MCExpr.h @@ -53,8 +53,9 @@ bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, const MCAsmLayout *Layout, - const SectionAddrMap *Addrs, - bool InSet) const; + const SectionAddrMap *Addrs, bool InSet, + bool ForceVarExpansion) const; + public: /// @name Accessors /// @{ @@ -93,6 +94,14 @@ /// @result - True on success. 
bool EvaluateAsRelocatable(MCValue &Res, const MCAsmLayout *Layout) const; + /// \brief Try to evaluate the expression to the form (a - b + constant) where + /// neither a nor b are variables. + /// + /// This is a more aggressive variant of EvaluateAsRelocatable. The intended + /// use is for when relocations are not available, like the symbol value in + /// the symbol table. + bool EvaluateAsValue(MCValue &Res, const MCAsmLayout *Layout) const; + /// FindAssociatedSection - Find the "associated section" for this expression, /// which is currently defined as the absolute section for constants, or /// otherwise the section associated with the first defined symbol in the Index: lib/Analysis/BlockFrequencyInfoImpl.cpp =================================================================== --- lib/Analysis/BlockFrequencyInfoImpl.cpp +++ lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -17,6 +17,7 @@ #include using namespace llvm; +using namespace llvm::bfi_detail; #define DEBUG_TYPE "block-freq" @@ -568,7 +569,7 @@ BFI.Freqs = std::move(SavedFreqs); } -void BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, +bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, const LoopData *OuterLoop, const BlockNode &Pred, const BlockNode &Succ, @@ -598,34 +599,48 @@ if (isLoopHeader(Resolved)) { DEBUG(debugSuccessor("backedge")); Dist.addBackedge(OuterLoop->getHeader(), Weight); - return; + return true; } if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { DEBUG(debugSuccessor(" exit ")); Dist.addExit(Resolved, Weight); - return; + return true; } if (Resolved < Pred) { - // Irreducible backedge. Skip. - DEBUG(debugSuccessor(" skip ")); - return; + if (!isLoopHeader(Pred)) { + // If OuterLoop is an irreducible loop, we can't actually handle this. + assert((!OuterLoop || !OuterLoop->isIrreducible()) && + "unhandled irreducible control flow"); + + // Irreducible backedge. Abort. 
+ DEBUG(debugSuccessor("abort!!!")); + return false; + } + + // If "Pred" is a loop header, then this isn't really a backedge; rather, + // OuterLoop must be irreducible. These false backedges can come only from + // secondary loop headers. + assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) && + "unhandled irreducible control flow"); } DEBUG(debugSuccessor(" local ")); Dist.addLocal(Resolved, Weight); + return true; } -void BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( +bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) { // Copy the exit map into Dist. for (const auto &I : Loop.Exits) - addToDist(Dist, OuterLoop, Loop.getHeader(), I.first, I.second.getMass()); + if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first, + I.second.getMass())) + // Irreducible backedge. + return false; - // We don't need this map any more. Clear it to prevent quadratic memory - // usage in deeply nested loops with irreducible control flow. - Loop.Exits.clear(); + return true; } /// \brief Get the maximum allowed loop scale. @@ -637,8 +652,7 @@ /// \brief Compute the loop scale for a loop. void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { // Compute loop scale. - DEBUG(dbgs() << "compute-loop-scale: " << getBlockName(Loop.getHeader()) - << "\n"); + DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n"); // LoopScale == 1 / ExitMass // ExitMass == HeadMass - BackedgeMass @@ -659,12 +673,15 @@ /// \brief Package up a loop. void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { - DEBUG(dbgs() << "packaging-loop: " << getBlockName(Loop.getHeader()) << "\n"); + DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n"); + + // Clear the subloop exits to prevent quadratic memory usage. 
+ for (const BlockNode &M : Loop.Nodes) { + if (auto *Loop = Working[M.Index].getPackagedLoop()) + Loop->Exits.clear(); + DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); + } Loop.IsPackaged = true; - DEBUG(for (const BlockNode &M - : Loop.members()) { - dbgs() << " - node: " << getBlockName(M.Index) << "\n"; - }); } void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, @@ -745,7 +762,7 @@ /// Visits all the members of a loop, adjusting their BlockData according to /// the loop's pseudo-node. static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) { - DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getBlockName(Loop.getHeader()) + DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop) << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale << "\n"); Loop.Scale *= Loop.Mass.toFloat(); @@ -757,7 +774,7 @@ // final head scale will be used for updated the rest of the members. for (const BlockNode &N : Loop.Nodes) { const auto &Working = BFI.Working[N.Index]; - Float &F = Working.isAPackage() ? BFI.getLoopPackage(N).Scale + Float &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale : BFI.Freqs[N.Index].Floating; Float New = Loop.Scale * F; DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " << New @@ -813,6 +830,10 @@ BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { return std::string(); } +std::string +BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { + return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? 
"**" : "*"); +} raw_ostream & BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, @@ -828,3 +849,172 @@ return OS << Block / Entry; } + +void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { + Start = OuterLoop.getHeader(); + Nodes.reserve(OuterLoop.Nodes.size()); + for (auto N : OuterLoop.Nodes) + addNode(N); + indexNodes(); +} +void IrreducibleGraph::addNodesInFunction() { + Start = 0; + for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) + if (!BFI.Working[Index].isPackaged()) + addNode(Index); + indexNodes(); +} +void IrreducibleGraph::indexNodes() { + for (auto &I : Nodes) + Lookup[I.Node.Index] = &I; +} +void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, + const BFIBase::LoopData *OuterLoop) { + if (OuterLoop && OuterLoop->isHeader(Succ)) + return; + auto L = Lookup.find(Succ.Index); + if (L == Lookup.end()) + return; + IrrNode &SuccIrr = *L->second; + Irr.Edges.push_back(&SuccIrr); + SuccIrr.Edges.push_front(&Irr); + ++SuccIrr.NumIn; +} + +namespace llvm { +template <> struct GraphTraits { + typedef bfi_detail::IrreducibleGraph GraphT; + + typedef const GraphT::IrrNode NodeType; + typedef GraphT::IrrNode::iterator ChildIteratorType; + + static const NodeType *getEntryNode(const GraphT &G) { + return G.StartIrr; + } + static ChildIteratorType child_begin(NodeType *N) { return N->succ_begin(); } + static ChildIteratorType child_end(NodeType *N) { return N->succ_end(); } +}; +} + +/// \brief Find extra irreducible headers. +/// +/// Find entry blocks and other blocks with backedges, which exist when \c G +/// contains irreducible sub-SCCs. +static void findIrreducibleHeaders( + const BlockFrequencyInfoImplBase &BFI, + const IrreducibleGraph &G, + const std::vector &SCC, + LoopData::NodeList &Headers, LoopData::NodeList &Others) { + // Map from nodes in the SCC to whether it's an entry block. + SmallDenseMap InSCC; + + // InSCC also acts as the set of nodes in the graph. Seed it.
+ for (const auto *I : SCC) + InSCC[I] = false; + + for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { + auto &Irr = *I->first; + for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { + if (InSCC.count(P)) + continue; + + // This is an entry block. + I->second = true; + Headers.push_back(Irr.Node); + DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) << "\n"); + break; + } + } + assert(Headers.size() >= 2 && "Should be irreducible"); + if (Headers.size() == InSCC.size()) { + // Every block is a header. + std::sort(Headers.begin(), Headers.end()); + return; + } + + // Look for extra headers from irreducible sub-SCCs. + for (const auto &I : InSCC) { + // Entry blocks are already headers. + if (I.second) + continue; + + auto &Irr = *I.first; + for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { + // Skip forward edges. + if (P->Node < Irr.Node) + continue; + + // Skip predecessors from entry blocks. These can have inverted + // ordering. + if (InSCC.lookup(P)) + continue; + + // Store the extra header. + Headers.push_back(Irr.Node); + DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) << "\n"); + break; + } + if (Headers.back() == Irr.Node) + // Added this as a header. + continue; + + // This is not a header. + Others.push_back(Irr.Node); + DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); + } + std::sort(Headers.begin(), Headers.end()); + std::sort(Others.begin(), Others.end()); +} + +static void createIrreducibleLoop( + BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, + LoopData *OuterLoop, std::list::iterator Insert, + const std::vector &SCC) { + // Translate the SCC into RPO. + DEBUG(dbgs() << " - found-scc\n"); + + LoopData::NodeList Headers; + LoopData::NodeList Others; + findIrreducibleHeaders(BFI, G, SCC, Headers, Others); + + auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(), + Headers.end(), Others.begin(), Others.end()); + + // Update loop hierarchy. 
+ for (const auto &N : Loop->Nodes) + if (BFI.Working[N.Index].isLoopHeader()) + BFI.Working[N.Index].Loop->Parent = &*Loop; + else + BFI.Working[N.Index].Loop = &*Loop; +} + +iterator_range::iterator> +BlockFrequencyInfoImplBase::analyzeIrreducible( + const IrreducibleGraph &G, LoopData *OuterLoop, + std::list::iterator Insert) { + assert((OuterLoop == nullptr) == (Insert == Loops.begin())); + auto Prev = OuterLoop ? std::prev(Insert) : Loops.end(); + + for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { + if (I->size() < 2) + continue; + + // Translate the SCC into RPO. + createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); + } + + if (OuterLoop) + return make_range(std::next(Prev), Insert); + return make_range(Loops.begin(), Insert); +} + +void +BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { + OuterLoop.Exits.clear(); + OuterLoop.BackedgeMass = BlockMass::getEmpty(); + auto O = OuterLoop.Nodes.begin() + 1; + for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) + if (!Working[I->Index].isPackaged()) + *O++ = *I; + OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); +} Index: lib/CodeGen/AsmPrinter/AddressPool.h =================================================================== --- lib/CodeGen/AsmPrinter/AddressPool.h +++ lib/CodeGen/AsmPrinter/AddressPool.h @@ -26,6 +26,11 @@ AddressPoolEntry(unsigned Number, bool TLS) : Number(Number), TLS(TLS) {} }; DenseMap Pool; + + /// Record whether the AddressPool has been queried for an address index since + /// the last "resetUsedFlag" call. Used to implement type unit fallback - a + /// type that references addresses cannot be placed in a type unit when using + /// fission. bool HasBeenUsed; public: Index: lib/CodeGen/AsmPrinter/DIE.h =================================================================== --- lib/CodeGen/AsmPrinter/DIE.h +++ lib/CodeGen/AsmPrinter/DIE.h @@ -124,6 +124,12 @@ /// Children DIEs. 
/// + // This can't be a vector because pointer validity is required for the + // Parent pointer and DIEEntry. + // It can't be a list because some clients need pointer validity before + // the object has been added to any child list + // (eg: DwarfUnit::constructVariableDIE). These aren't insurmountable, but may + // be more convoluted than beneficial. std::vector> Children; DIE *Parent; Index: lib/CodeGen/AsmPrinter/DwarfDebug.h =================================================================== --- lib/CodeGen/AsmPrinter/DwarfDebug.h +++ lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -359,14 +359,19 @@ /// \brief Construct new DW_TAG_lexical_block for this scope and /// attach DW_AT_low_pc/DW_AT_high_pc labels. - DIE *constructLexicalScopeDIE(DwarfCompileUnit &TheCU, LexicalScope *Scope); + std::unique_ptr constructLexicalScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope); /// \brief This scope represents inlined body of a function. Construct /// DIE to represent this concrete inlined copy of the function. - DIE *constructInlinedScopeDIE(DwarfCompileUnit &TheCU, LexicalScope *Scope); + std::unique_ptr constructInlinedScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope); /// \brief Construct a DIE for this scope. - DIE *constructScopeDIE(DwarfCompileUnit &TheCU, LexicalScope *Scope); + std::unique_ptr constructScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope); + /// \brief Construct a DIE for the subprogram scope of this function. + DIE *constructSubprogramScopeDIE(DwarfCompileUnit &TheCU, LexicalScope *Scope); /// A helper function to create children of a Scope DIE. DIE *createScopeChildrenDIE(DwarfCompileUnit &TheCU, LexicalScope *Scope, SmallVectorImpl> &Children); @@ -494,11 +499,11 @@ /// \brief Construct import_module DIE. void constructImportedEntityDIE(DwarfCompileUnit &TheCU, const MDNode *N, - DIE *Context); + DIE &Context); /// \brief Construct import_module DIE.
void constructImportedEntityDIE(DwarfCompileUnit &TheCU, - const DIImportedEntity &Module, DIE *Context); + const DIImportedEntity &Module, DIE &Context); /// \brief Register a source line with debug info. Returns the unique /// label that was emitted and which provides correspondence to the Index: lib/CodeGen/AsmPrinter/DwarfDebug.cpp =================================================================== --- lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -118,7 +118,6 @@ return Var.isBlockByrefVariable(DD->getTypeIdentifierMap()); } - DIType DbgVariable::getType() const { DIType Ty = Var.getType().resolve(DD->getTypeIdentifierMap()); // FIXME: isBlockByrefVariable should be reformulated in terms of complex @@ -210,9 +209,8 @@ else HasDwarfPubSections = DwarfPubSections == Enable; - DwarfVersion = DwarfVersionNumber - ? DwarfVersionNumber - : MMI->getModule()->getDwarfVersion(); + DwarfVersion = DwarfVersionNumber ? DwarfVersionNumber + : MMI->getModule()->getDwarfVersion(); { NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); @@ -421,12 +419,13 @@ // Construct new DW_TAG_lexical_block for this scope and attach // DW_AT_low_pc/DW_AT_high_pc labels. -DIE *DwarfDebug::constructLexicalScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope) { +std::unique_ptr +DwarfDebug::constructLexicalScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope) { if (isLexicalScopeDIENull(Scope)) return nullptr; - DIE *ScopeDIE = new DIE(dwarf::DW_TAG_lexical_block); + auto ScopeDIE = make_unique(dwarf::DW_TAG_lexical_block); if (Scope->isAbstractScope()) return ScopeDIE; @@ -454,8 +453,9 @@ // This scope represents inlined body of a function. Construct DIE to // represent this concrete inlined copy of the function. 
-DIE *DwarfDebug::constructInlinedScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope) { +std::unique_ptr +DwarfDebug::constructInlinedScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope) { const SmallVectorImpl &ScopeRanges = Scope->getRanges(); assert(!ScopeRanges.empty() && "LexicalScope does not have instruction markers!"); @@ -470,7 +470,7 @@ return nullptr; } - DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine); + auto ScopeDIE = make_unique(dwarf::DW_TAG_inlined_subroutine); TheCU.addDIEEntry(*ScopeDIE, dwarf::DW_AT_abstract_origin, *OriginDIE); // If we have multiple ranges, emit them into the range section. @@ -539,38 +539,74 @@ ObjectPointer = Children.back().get(); } for (LexicalScope *LS : Scope->getChildren()) - if (DIE *Nested = constructScopeDIE(TheCU, LS)) - Children.push_back(std::unique_ptr(Nested)); + if (std::unique_ptr Nested = constructScopeDIE(TheCU, LS)) + Children.push_back(std::move(Nested)); return ObjectPointer; } +DIE *DwarfDebug::constructSubprogramScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope) { + assert(Scope && Scope->getScopeNode()); + + DIScope DS(Scope->getScopeNode()); + + assert(!Scope->getInlinedAt()); + assert(DS.isSubprogram()); + + ProcessedSPNodes.insert(DS); + + SmallVector, 8> Children; + DIE *ScopeDIE; + + if (Scope->isAbstractScope()) { + ScopeDIE = TheCU.getDIE(DS); + // Note down abstract DIE. + if (ScopeDIE) + AbstractSPDies.insert(std::make_pair(DS, ScopeDIE)); + else { + assert(Children.empty() && + "We create children only when the scope DIE is not null."); + return nullptr; + } + } else + ScopeDIE = updateSubprogramScopeDIE(TheCU, DISubprogram(DS)); + + // We create children when the scope DIE is not null. 
+ if (DIE *ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children)) + TheCU.addDIEEntry(*ScopeDIE, dwarf::DW_AT_object_pointer, *ObjectPointer); + + // Add children + for (auto &I : Children) + ScopeDIE->addChild(std::move(I)); + + return ScopeDIE; +} + // Construct a DIE for this scope. -DIE *DwarfDebug::constructScopeDIE(DwarfCompileUnit &TheCU, - LexicalScope *Scope) { +std::unique_ptr DwarfDebug::constructScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope) { if (!Scope || !Scope->getScopeNode()) return nullptr; DIScope DS(Scope->getScopeNode()); + assert((Scope->getInlinedAt() || !DS.isSubprogram()) && + "Only handle inlined subprograms here, use " + "constructSubprogramScopeDIE for non-inlined " + "subprograms"); + SmallVector, 8> Children; - DIE *ObjectPointer = nullptr; - bool ChildrenCreated = false; // We try to create the scope DIE first, then the children DIEs. This will // avoid creating un-used children then removing them later when we find out // the scope DIE is null. - DIE *ScopeDIE = nullptr; - if (Scope->getInlinedAt()) + std::unique_ptr ScopeDIE; + if (Scope->getInlinedAt()) { ScopeDIE = constructInlinedScopeDIE(TheCU, Scope); - else if (DS.isSubprogram()) { - ProcessedSPNodes.insert(DS); - if (Scope->isAbstractScope()) { - ScopeDIE = TheCU.getDIE(DS); - // Note down abstract DIE. - if (ScopeDIE) - AbstractSPDies.insert(std::make_pair(DS, ScopeDIE)); - } else - ScopeDIE = updateSubprogramScopeDIE(TheCU, DISubprogram(DS)); + if (!ScopeDIE) + return nullptr; + // We create children when the scope DIE is not null. + createScopeChildrenDIE(TheCU, Scope, Children); } else { // Early exit when we know the scope DIE is going to be null. if (isLexicalScopeDIENull(Scope)) @@ -578,42 +614,28 @@ // We create children here when we know the scope DIE is not going to be // null and the children will be added to the scope DIE. 
- ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children); - ChildrenCreated = true; + createScopeChildrenDIE(TheCU, Scope, Children); // There is no need to emit empty lexical block DIE. std::pair Range = - std::equal_range( - ScopesWithImportedEntities.begin(), - ScopesWithImportedEntities.end(), - std::pair(DS, nullptr), - less_first()); + std::equal_range(ScopesWithImportedEntities.begin(), + ScopesWithImportedEntities.end(), + std::pair(DS, nullptr), + less_first()); if (Children.empty() && Range.first == Range.second) return nullptr; ScopeDIE = constructLexicalScopeDIE(TheCU, Scope); assert(ScopeDIE && "Scope DIE should not be null."); for (ImportedEntityMap::const_iterator i = Range.first; i != Range.second; ++i) - constructImportedEntityDIE(TheCU, i->second, ScopeDIE); - } - - if (!ScopeDIE) { - assert(Children.empty() && - "We create children only when the scope DIE is not null."); - return nullptr; + constructImportedEntityDIE(TheCU, i->second, *ScopeDIE); } - if (!ChildrenCreated) - // We create children when the scope DIE is not null. - ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children); // Add children for (auto &I : Children) ScopeDIE->addChild(std::move(I)); - if (DS.isSubprogram() && ObjectPointer != nullptr) - TheCU.addDIEEntry(*ScopeDIE, dwarf::DW_AT_object_pointer, *ObjectPointer); - return ScopeDIE; } @@ -630,10 +652,10 @@ StringRef FN = DIUnit.getFilename(); CompilationDir = DIUnit.getDirectory(); - DIE *Die = new DIE(dwarf::DW_TAG_compile_unit); auto OwnedUnit = make_unique( - InfoHolder.getUnits().size(), Die, DIUnit, Asm, this, &InfoHolder); + InfoHolder.getUnits().size(), DIUnit, Asm, this, &InfoHolder); DwarfCompileUnit &NewCU = *OwnedUnit; + DIE &Die = NewCU.getUnitDie(); InfoHolder.addUnit(std::move(OwnedUnit)); // LTO with assembly output shares a single line table amongst multiple CUs. 
@@ -644,10 +666,10 @@ Asm->OutStreamer.getContext().setMCLineTableCompilationDir( NewCU.getUniqueID(), CompilationDir); - NewCU.addString(*Die, dwarf::DW_AT_producer, DIUnit.getProducer()); - NewCU.addUInt(*Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, + NewCU.addString(Die, dwarf::DW_AT_producer, DIUnit.getProducer()); + NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, DIUnit.getLanguage()); - NewCU.addString(*Die, dwarf::DW_AT_name, FN); + NewCU.addString(Die, dwarf::DW_AT_name, FN); if (!useSplitDwarf()) { NewCU.initStmtList(DwarfLineSectionSym); @@ -655,20 +677,20 @@ // If we're using split dwarf the compilation dir is going to be in the // skeleton CU and so we don't need to duplicate it here. if (!CompilationDir.empty()) - NewCU.addString(*Die, dwarf::DW_AT_comp_dir, CompilationDir); + NewCU.addString(Die, dwarf::DW_AT_comp_dir, CompilationDir); - addGnuPubAttributes(NewCU, *Die); + addGnuPubAttributes(NewCU, Die); } if (DIUnit.isOptimized()) - NewCU.addFlag(*Die, dwarf::DW_AT_APPLE_optimized); + NewCU.addFlag(Die, dwarf::DW_AT_APPLE_optimized); StringRef Flags = DIUnit.getFlags(); if (!Flags.empty()) - NewCU.addString(*Die, dwarf::DW_AT_APPLE_flags, Flags); + NewCU.addString(Die, dwarf::DW_AT_APPLE_flags, Flags); if (unsigned RVer = DIUnit.getRunTimeVersion()) - NewCU.addUInt(*Die, dwarf::DW_AT_APPLE_major_runtime_vers, + NewCU.addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers, dwarf::DW_FORM_data1, RVer); if (!FirstCU) @@ -683,7 +705,7 @@ DwarfInfoSectionSym); CUMap.insert(std::make_pair(DIUnit, &NewCU)); - CUDieMap.insert(std::make_pair(Die, &NewCU)); + CUDieMap.insert(std::make_pair(&Die, &NewCU)); return NewCU; } @@ -716,11 +738,11 @@ DIImportedEntity Module(N); assert(Module.Verify()); if (DIE *D = TheCU.getOrCreateContextDIE(Module.getContext())) - constructImportedEntityDIE(TheCU, Module, D); + constructImportedEntityDIE(TheCU, Module, *D); } void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU, - const MDNode *N, 
DIE *Context) { + const MDNode *N, DIE &Context) { DIImportedEntity Module(N); assert(Module.Verify()); return constructImportedEntityDIE(TheCU, Module, Context); @@ -728,11 +750,10 @@ void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU, const DIImportedEntity &Module, - DIE *Context) { + DIE &Context) { assert(Module.Verify() && "Use one of the MDNode * overloads to handle invalid metadata"); - assert(Context && "Should always have a context for an imported_module"); - DIE &IMDie = TheCU.createAndAddDIE(Module.getTag(), *Context, Module); + DIE &IMDie = TheCU.createAndAddDIE(Module.getTag(), Context, Module); DIE *EntityDie; DIDescriptor Entity = resolve(Module.getEntity()); if (Entity.isNameSpace()) @@ -1655,10 +1676,10 @@ } } if (ProcessedSPNodes.count(AScope->getScopeNode()) == 0) - constructScopeDIE(TheCU, AScope); + constructSubprogramScopeDIE(TheCU, AScope); } - DIE &CurFnDIE = *constructScopeDIE(TheCU, FnScope); + DIE &CurFnDIE = *constructSubprogramScopeDIE(TheCU, FnScope); if (!CurFn->getTarget().Options.DisableFramePointerElim(*CurFn)) TheCU.addFlag(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr); @@ -2051,7 +2072,7 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer, const DebugLocEntry &Entry) { assert(Entry.getValues().size() == 1 && - "multi-value entries are not supported yet."); + "multi-value entries are not supported yet."); const DebugLocEntry::Value Value = Entry.getValues()[0]; DIVariable DV(Value.getVariable()); if (Value.isInt()) { @@ -2185,7 +2206,7 @@ Asm->OutStreamer.SwitchSection( Asm->getObjFileLowering().getDwarfARangesSection()); - typedef DenseMap > SpansType; + typedef DenseMap> SpansType; SpansType Spans; @@ -2334,9 +2355,6 @@ for (const auto &I : CUMap) { DwarfCompileUnit *TheCU = I.second; - // Emit a symbol so we can find the beginning of our ranges. - Asm->OutStreamer.EmitLabel(TheCU->getLabelRange()); - // Iterate over the misc ranges for the compile units in the module. 
for (const RangeSpanList &List : TheCU->getRangeLists()) { // Emit our symbol so we can find the beginning of the range. @@ -2402,16 +2420,15 @@ // DW_AT_addr_base, DW_AT_ranges_base. DwarfCompileUnit &DwarfDebug::constructSkeletonCU(const DwarfCompileUnit &CU) { - DIE *Die = new DIE(dwarf::DW_TAG_compile_unit); auto OwnedUnit = make_unique( - CU.getUniqueID(), Die, CU.getCUNode(), Asm, this, &SkeletonHolder); + CU.getUniqueID(), CU.getCUNode(), Asm, this, &SkeletonHolder); DwarfCompileUnit &NewCU = *OwnedUnit; NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoSection(), DwarfInfoSectionSym); NewCU.initStmtList(DwarfLineSectionSym); - initSkeletonUnit(CU, *Die, std::move(OwnedUnit)); + initSkeletonUnit(CU, NewCU.getUnitDie(), std::move(OwnedUnit)); return NewCU; } @@ -2422,16 +2439,15 @@ DwarfCompileUnit &CU = static_cast( *SkeletonHolder.getUnits()[TU.getCU().getUniqueID()]); - DIE *Die = new DIE(dwarf::DW_TAG_type_unit); - auto OwnedUnit = make_unique(TU.getUniqueID(), Die, CU, Asm, - this, &SkeletonHolder); + auto OwnedUnit = make_unique(TU.getUniqueID(), CU, Asm, this, + &SkeletonHolder); DwarfTypeUnit &NewTU = *OwnedUnit; NewTU.setTypeSignature(TU.getTypeSignature()); NewTU.setType(nullptr); NewTU.initSection( Asm->getObjFileLowering().getDwarfTypesSection(TU.getTypeSignature())); - initSkeletonUnit(TU, *Die, std::move(OwnedUnit)); + initSkeletonUnit(TU, NewTU.getUnitDie(), std::move(OwnedUnit)); return NewTU; } @@ -2441,7 +2457,7 @@ assert(useSplitDwarf() && "No split dwarf debug info?"); // Don't pass an abbrev symbol, using a constant zero instead so as not to // emit relocations into the dwo file. - InfoHolder.emitUnits(this, /* AbbrevSymbol */nullptr); + InfoHolder.emitUnits(this, /* AbbrevSymbol */ nullptr); } // Emit the .debug_abbrev.dwo section for separated dwarf. 
This contains the @@ -2507,22 +2523,23 @@ bool TopLevelType = TypeUnitsUnderConstruction.empty(); AddrPool.resetUsedFlag(); - DIE *UnitDie = new DIE(dwarf::DW_TAG_type_unit); auto OwnedUnit = - make_unique(InfoHolder.getUnits().size(), UnitDie, CU, Asm, - this, &InfoHolder, getDwoLineTable(CU)); + make_unique(InfoHolder.getUnits().size(), CU, Asm, this, + &InfoHolder, getDwoLineTable(CU)); DwarfTypeUnit &NewTU = *OwnedUnit; + DIE &UnitDie = NewTU.getUnitDie(); TU = &NewTU; - TypeUnitsUnderConstruction.push_back(std::make_pair(std::move(OwnedUnit), CTy)); + TypeUnitsUnderConstruction.push_back( + std::make_pair(std::move(OwnedUnit), CTy)); - NewTU.addUInt(*UnitDie, dwarf::DW_AT_language, dwarf::DW_FORM_data2, + NewTU.addUInt(UnitDie, dwarf::DW_AT_language, dwarf::DW_FORM_data2, CU.getLanguage()); uint64_t Signature = makeTypeSignature(Identifier); NewTU.setTypeSignature(Signature); if (!useSplitDwarf()) - CU.applyStmtList(*UnitDie); + CU.applyStmtList(UnitDie); NewTU.initSection( useSplitDwarf() Index: lib/CodeGen/AsmPrinter/DwarfUnit.h =================================================================== --- lib/CodeGen/AsmPrinter/DwarfUnit.h +++ lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -73,7 +73,7 @@ DICompileUnit CUNode; /// Unit debug information entry. - const std::unique_ptr UnitDie; + DIE UnitDie; /// Offset of the UnitDie from beginning of debug info section. unsigned DebugInfoOffset; @@ -138,13 +138,10 @@ /// The end of the unit within its section. MCSymbol *LabelEnd; - /// The label for the start of the range sets for the elements of this unit. - MCSymbol *LabelRange; - /// Skeleton unit associated with this unit. 
DwarfUnit *Skeleton; - DwarfUnit(unsigned UID, DIE *D, DICompileUnit CU, AsmPrinter *A, + DwarfUnit(unsigned UID, dwarf::Tag, DICompileUnit CU, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU); public: @@ -167,7 +164,6 @@ Asm->GetTempSymbol(Section->getLabelBeginName(), getUniqueID()); this->LabelEnd = Asm->GetTempSymbol(Section->getLabelEndName(), getUniqueID()); - this->LabelRange = Asm->GetTempSymbol("gnu_ranges", getUniqueID()); } const MCSection *getSection() const { @@ -206,16 +202,11 @@ return LabelEnd; } - MCSymbol *getLabelRange() const { - assert(Section); - return LabelRange; - } - // Accessors. unsigned getUniqueID() const { return UniqueID; } uint16_t getLanguage() const { return CUNode.getLanguage(); } DICompileUnit getCUNode() const { return CUNode; } - DIE &getUnitDie() const { return *UnitDie; } + DIE &getUnitDie() { return UnitDie; } const StringMap &getGlobalNames() const { return GlobalNames; } const StringMap &getGlobalTypes() const { return GlobalTypes; } @@ -223,7 +214,7 @@ void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; } /// hasContent - Return true if this compile unit has something to write out. - bool hasContent() const { return !UnitDie->getChildren().empty(); } + bool hasContent() const { return !UnitDie.getChildren().empty(); } /// addRange - Add an address range to the list of ranges for this unit. 
void addRange(RangeSpan Range); @@ -533,7 +524,7 @@ unsigned stmtListIndex; public: - DwarfCompileUnit(unsigned UID, DIE *D, DICompileUnit Node, AsmPrinter *A, + DwarfCompileUnit(unsigned UID, DICompileUnit Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU); void initStmtList(MCSymbol *DwarfLineSectionSym); @@ -567,7 +558,7 @@ MCDwarfDwoLineTable *SplitLineTable; public: - DwarfTypeUnit(unsigned UID, DIE *D, DwarfCompileUnit &CU, AsmPrinter *A, + DwarfTypeUnit(unsigned UID, DwarfCompileUnit &CU, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU, MCDwarfDwoLineTable *SplitLineTable = nullptr); Index: lib/CodeGen/AsmPrinter/DwarfUnit.cpp =================================================================== --- lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -41,28 +41,30 @@ cl::init(false)); /// Unit - Unit constructor. -DwarfUnit::DwarfUnit(unsigned UID, DIE *D, DICompileUnit Node, AsmPrinter *A, - DwarfDebug *DW, DwarfFile *DWU) - : UniqueID(UID), CUNode(Node), UnitDie(D), DebugInfoOffset(0), Asm(A), +DwarfUnit::DwarfUnit(unsigned UID, dwarf::Tag UnitTag, DICompileUnit Node, + AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU) + : UniqueID(UID), CUNode(Node), UnitDie(UnitTag), DebugInfoOffset(0), Asm(A), DD(DW), DU(DWU), IndexTyDie(nullptr), Section(nullptr), Skeleton(nullptr) { + assert(UnitTag == dwarf::DW_TAG_compile_unit || + UnitTag == dwarf::DW_TAG_type_unit); DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1); } -DwarfCompileUnit::DwarfCompileUnit(unsigned UID, DIE *D, DICompileUnit Node, +DwarfCompileUnit::DwarfCompileUnit(unsigned UID, DICompileUnit Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU) - : DwarfUnit(UID, D, Node, A, DW, DWU) { - insertDIE(Node, D); + : DwarfUnit(UID, dwarf::DW_TAG_compile_unit, Node, A, DW, DWU) { + insertDIE(Node, &getUnitDie()); } -DwarfTypeUnit::DwarfTypeUnit(unsigned UID, DIE *D, DwarfCompileUnit &CU, - AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU, +DwarfTypeUnit::DwarfTypeUnit(unsigned UID, 
DwarfCompileUnit &CU, AsmPrinter *A, + DwarfDebug *DW, DwarfFile *DWU, MCDwarfDwoLineTable *SplitLineTable) - : DwarfUnit(UID, D, CU.getCUNode(), A, DW, DWU), CU(CU), - SplitLineTable(SplitLineTable) { + : DwarfUnit(UID, dwarf::DW_TAG_type_unit, CU.getCUNode(), A, DW, DWU), + CU(CU), SplitLineTable(SplitLineTable) { if (SplitLineTable) - addSectionOffset(*UnitDie, dwarf::DW_AT_stmt_list, 0); + addSectionOffset(UnitDie, dwarf::DW_AT_stmt_list, 0); } /// ~Unit - Destructor for compile unit. @@ -1423,7 +1425,7 @@ DISubprogram SPDecl = SP.getFunctionDeclaration(); if (SPDecl.isSubprogram()) // Add subprogram definitions to the CU die directly. - ContextDIE = UnitDie.get(); + ContextDIE = &getUnitDie(); // DW_TAG_inlined_subroutine may refer to this DIE. DIE &SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP); @@ -1643,7 +1645,7 @@ if (GVContext && GV.isDefinition() && !GVContext.isCompileUnit() && !GVContext.isFile() && !DD->isSubprogramContext(GVContext)) { // Create specification DIE. - VariableSpecDIE = &createAndAddDIE(dwarf::DW_TAG_variable, *UnitDie); + VariableSpecDIE = &createAndAddDIE(dwarf::DW_TAG_variable, UnitDie); addDIEEntry(*VariableSpecDIE, dwarf::DW_AT_specification, *VariableDIE); addBlock(*VariableSpecDIE, dwarf::DW_AT_location, Loc); // A static member's declaration is already flagged as such. @@ -1741,7 +1743,7 @@ DIE *IdxTy = getIndexTyDie(); if (!IdxTy) { // Construct an integer type to use for indexes. 
- IdxTy = &createAndAddDIE(dwarf::DW_TAG_base_type, *UnitDie); + IdxTy = &createAndAddDIE(dwarf::DW_TAG_base_type, UnitDie); addString(*IdxTy, dwarf::DW_AT_name, "sizetype"); addUInt(*IdxTy, dwarf::DW_AT_byte_size, None, sizeof(int64_t)); addUInt(*IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, @@ -2048,7 +2050,7 @@ MCSymbol *LineTableStartSym = Asm->OutStreamer.getDwarfLineTableSymbol(getUniqueID()); - stmtListIndex = UnitDie->getValues().size(); + stmtListIndex = UnitDie.getValues().size(); // DW_AT_stmt_list is a offset of line number information for this // compile unit in debug_line section. For split dwarf this is @@ -2056,16 +2058,16 @@ // The line table entries are not always emitted in assembly, so it // is not okay to use line_table_start here. if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) - addSectionLabel(*UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym); + addSectionLabel(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym); else - addSectionDelta(*UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym, + addSectionDelta(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym, DwarfLineSectionSym); } void DwarfCompileUnit::applyStmtList(DIE &D) { D.addValue(dwarf::DW_AT_stmt_list, - UnitDie->getAbbrev().getData()[stmtListIndex].getForm(), - UnitDie->getValues()[stmtListIndex]); + UnitDie.getAbbrev().getData()[stmtListIndex].getForm(), + UnitDie.getValues()[stmtListIndex]); } void DwarfTypeUnit::emitHeader(const MCSymbol *ASectionSym) const { @@ -2090,5 +2092,4 @@ Asm->GetTempSymbol(Section->getLabelBeginName(), getUniqueID()); this->LabelEnd = Asm->GetTempSymbol(Section->getLabelEndName(), getUniqueID()); - this->LabelRange = Asm->GetTempSymbol("gnu_ranges", getUniqueID()); } Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3711,8 +3711,7 @@ BottomHalf = DAG.getNode(Ops[isSigned][1], dl, 
DAG.getVTList(VT, VT), LHS, RHS); TopHalf = BottomHalf.getValue(1); - } else if (TLI.isTypeLegal(EVT::getIntegerVT(*DAG.getContext(), - VT.getSizeInBits() * 2))) { + } else if (TLI.isTypeLegal(WideVT)) { LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS); RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS); Tmp1 = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS); Index: lib/DebugInfo/DWARFDebugArangeSet.h =================================================================== --- lib/DebugInfo/DWARFDebugArangeSet.h +++ lib/DebugInfo/DWARFDebugArangeSet.h @@ -63,7 +63,6 @@ return desc_iterator_range(ArangeDescriptors.begin(), ArangeDescriptors.end()); } - uint32_t getNumDescriptors() const { return ArangeDescriptors.size(); } }; } Index: lib/DebugInfo/DWARFDebugAranges.cpp =================================================================== --- lib/DebugInfo/DWARFDebugAranges.cpp +++ lib/DebugInfo/DWARFDebugAranges.cpp @@ -21,23 +21,11 @@ if (!DebugArangesData.isValidOffset(0)) return; uint32_t Offset = 0; - typedef std::vector RangeSetColl; - RangeSetColl Sets; DWARFDebugArangeSet Set; - uint32_t TotalRanges = 0; while (Set.extract(DebugArangesData, &Offset)) { - Sets.push_back(Set); - TotalRanges += Set.getNumDescriptors(); - } - if (TotalRanges == 0) - return; - - Aranges.reserve(TotalRanges); - for (const auto &I : Sets) { - uint32_t CUOffset = I.getCompileUnitDIEOffset(); - - for (const auto &Desc : I.descriptors()) { + uint32_t CUOffset = Set.getCompileUnitDIEOffset(); + for (const auto &Desc : Set.descriptors()) { uint64_t LowPC = Desc.Address; uint64_t HighPC = Desc.getEndAddress(); appendRange(CUOffset, LowPC, HighPC); @@ -112,11 +100,6 @@ ++minimal_size; } - // If the sizes are the same, then no consecutive aranges can be - // combined, we are done. - if (minimal_size == orig_arange_size) - return; - // Else, make a new RangeColl that _only_ contains what we need. 
RangeColl minimal_aranges; minimal_aranges.resize(minimal_size); Index: lib/DebugInfo/DWARFDebugFrame.h =================================================================== --- lib/DebugInfo/DWARFDebugFrame.h +++ lib/DebugInfo/DWARFDebugFrame.h @@ -12,14 +12,13 @@ #include "llvm/Support/DataExtractor.h" #include "llvm/Support/raw_ostream.h" +#include #include - namespace llvm { class FrameEntry; - /// \brief A parsed .debug_frame section /// class DWARFDebugFrame { @@ -35,8 +34,7 @@ void parse(DataExtractor Data); private: - typedef std::vector EntryVector; - EntryVector Entries; + std::vector> Entries; }; Index: lib/DebugInfo/DWARFDebugFrame.cpp =================================================================== --- lib/DebugInfo/DWARFDebugFrame.cpp +++ lib/DebugInfo/DWARFDebugFrame.cpp @@ -26,8 +26,8 @@ class llvm::FrameEntry { public: enum FrameKind {FK_CIE, FK_FDE}; - FrameEntry(FrameKind K, DataExtractor D, uint64_t Offset, uint64_t Length) - : Kind(K), Data(D), Offset(Offset), Length(Length) {} + FrameEntry(FrameKind K, uint64_t Offset, uint64_t Length) + : Kind(K), Offset(Offset), Length(Length) {} virtual ~FrameEntry() { } @@ -35,11 +35,12 @@ FrameKind getKind() const { return Kind; } virtual uint64_t getOffset() const { return Offset; } - /// \brief Parse and store a sequence of CFI instructions from our data - /// stream, starting at *Offset and ending at EndOffset. If everything + /// \brief Parse and store a sequence of CFI instructions from Data, + /// starting at *Offset and ending at EndOffset. If everything /// goes well, *Offset should be equal to EndOffset when this method /// returns. Otherwise, an error occurred. - virtual void parseInstructions(uint32_t *Offset, uint32_t EndOffset); + virtual void parseInstructions(DataExtractor Data, uint32_t *Offset, + uint32_t EndOffset); /// \brief Dump the entry header to the given output stream. 
virtual void dumpHeader(raw_ostream &OS) const = 0; @@ -50,10 +51,6 @@ protected: const FrameKind Kind; - /// \brief The data stream holding the section from which the entry was - /// parsed. - DataExtractor Data; - /// \brief Offset of this entry in the section. uint64_t Offset; @@ -97,8 +94,8 @@ const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; - -void FrameEntry::parseInstructions(uint32_t *Offset, uint32_t EndOffset) { +void FrameEntry::parseInstructions(DataExtractor Data, uint32_t *Offset, + uint32_t EndOffset) { while (*Offset < EndOffset) { uint8_t Opcode = Data.getU8(Offset); // Some instructions have a primary opcode encoded in the top bits. @@ -201,13 +198,13 @@ public: // CIEs (and FDEs) are simply container classes, so the only sensible way to // create them is by providing the full parsed contents in the constructor. - CIE(DataExtractor D, uint64_t Offset, uint64_t Length, uint8_t Version, + CIE(uint64_t Offset, uint64_t Length, uint8_t Version, SmallString<8> Augmentation, uint64_t CodeAlignmentFactor, int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister) - : FrameEntry(FK_CIE, D, Offset, Length), Version(Version), - Augmentation(Augmentation), CodeAlignmentFactor(CodeAlignmentFactor), - DataAlignmentFactor(DataAlignmentFactor), - ReturnAddressRegister(ReturnAddressRegister) {} + : FrameEntry(FK_CIE, Offset, Length), Version(Version), + Augmentation(Augmentation), CodeAlignmentFactor(CodeAlignmentFactor), + DataAlignmentFactor(DataAlignmentFactor), + ReturnAddressRegister(ReturnAddressRegister) {} ~CIE() { } @@ -229,7 +226,7 @@ static bool classof(const FrameEntry *FE) { return FE->getKind() == FK_CIE; - } + } private: /// The following fields are defined in section 6.4.1 of the DWARF standard v3 @@ -247,11 +244,11 @@ // Each FDE has a CIE it's "linked to". Our FDE contains is constructed with // an offset to the CIE (provided by parsing the FDE header). 
The CIE itself // is obtained lazily once it's actually required. - FDE(DataExtractor D, uint64_t Offset, uint64_t Length, - int64_t LinkedCIEOffset, uint64_t InitialLocation, uint64_t AddressRange) - : FrameEntry(FK_FDE, D, Offset, Length), LinkedCIEOffset(LinkedCIEOffset), - InitialLocation(InitialLocation), AddressRange(AddressRange), - LinkedCIE(nullptr) {} + FDE(uint64_t Offset, uint64_t Length, int64_t LinkedCIEOffset, + uint64_t InitialLocation, uint64_t AddressRange) + : FrameEntry(FK_FDE, Offset, Length), LinkedCIEOffset(LinkedCIEOffset), + InitialLocation(InitialLocation), AddressRange(AddressRange), + LinkedCIE(nullptr) {} ~FDE() { } @@ -270,9 +267,9 @@ static bool classof(const FrameEntry *FE) { return FE->getKind() == FK_FDE; - } -private: + } +private: /// The following fields are defined in section 6.4.1 of the DWARF standard v3 uint64_t LinkedCIEOffset; uint64_t InitialLocation; @@ -285,14 +282,9 @@ DWARFDebugFrame::DWARFDebugFrame() { } - DWARFDebugFrame::~DWARFDebugFrame() { - for (const auto &Entry : Entries) { - delete Entry; - } } - static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data, uint32_t Offset, int Length) { errs() << "DUMP: "; @@ -334,7 +326,6 @@ Id = Data.getUnsigned(&Offset, IsDWARF64 ? 8 : 4); bool IsCIE = ((IsDWARF64 && Id == DW64_CIE_ID) || Id == DW_CIE_ID); - FrameEntry *Entry = nullptr; if (IsCIE) { // Note: this is specifically DWARFv3 CIE header structure. It was // changed in DWARFv4. 
We currently don't support reading DWARFv4 @@ -346,30 +337,25 @@ int64_t DataAlignmentFactor = Data.getSLEB128(&Offset); uint64_t ReturnAddressRegister = Data.getULEB128(&Offset); - Entry = new CIE(Data, StartOffset, Length, Version, - StringRef(Augmentation), CodeAlignmentFactor, - DataAlignmentFactor, ReturnAddressRegister); + Entries.emplace_back(new CIE(StartOffset, Length, Version, + StringRef(Augmentation), CodeAlignmentFactor, + DataAlignmentFactor, ReturnAddressRegister)); } else { // FDE uint64_t CIEPointer = Id; uint64_t InitialLocation = Data.getAddress(&Offset); uint64_t AddressRange = Data.getAddress(&Offset); - Entry = new FDE(Data, StartOffset, Length, CIEPointer, - InitialLocation, AddressRange); + Entries.emplace_back(new FDE(StartOffset, Length, CIEPointer, + InitialLocation, AddressRange)); } - assert(Entry && "Expected Entry to be populated with CIE or FDE"); - Entry->parseInstructions(&Offset, EndStructureOffset); + Entries.back()->parseInstructions(Data, &Offset, EndStructureOffset); - if (Offset == EndStructureOffset) { - // Entry instrucitons parsed successfully. 
- Entries.push_back(Entry); - } else { + if (Offset != EndStructureOffset) { std::string Str; raw_string_ostream OS(Str); - OS << format("Parsing entry instructions at %lx failed", - Entry->getOffset()); + OS << format("Parsing entry instructions at %lx failed", StartOffset); report_fatal_error(Str); } } Index: lib/MC/ELFObjectWriter.cpp =================================================================== --- lib/MC/ELFObjectWriter.cpp +++ lib/MC/ELFObjectWriter.cpp @@ -496,7 +496,7 @@ if (Symbol->isVariable()) { const MCExpr *Expr = Symbol->getVariableValue(); MCValue Value; - if (!Expr->EvaluateAsRelocatable(Value, &Layout)) + if (!Expr->EvaluateAsValue(Value, &Layout)) llvm_unreachable("Invalid expression"); assert(!Value.getSymB()); @@ -606,7 +606,7 @@ const MCExpr *Expr = Symbol.getVariableValue(); MCValue Value; - if (!Expr->EvaluateAsRelocatable(Value, &Layout)) + if (!Expr->EvaluateAsValue(Value, &Layout)) llvm_unreachable("Invalid Expression"); const MCSymbolRefExpr *RefB = Value.getSymB(); if (RefB) { Index: lib/MC/MCExpr.cpp =================================================================== --- lib/MC/MCExpr.cpp +++ lib/MC/MCExpr.cpp @@ -478,7 +478,8 @@ // absolutize differences across sections and that is what the MachO writer // uses Addrs for. bool IsRelocatable = - EvaluateAsRelocatableImpl(Value, Asm, Layout, Addrs, /*InSet*/ Addrs); + EvaluateAsRelocatableImpl(Value, Asm, Layout, Addrs, /*InSet*/ Addrs, + /*ForceVarExpansion*/ false); // Record the current value. Res = Value.getConstant(); @@ -629,14 +630,20 @@ bool MCExpr::EvaluateAsRelocatable(MCValue &Res, const MCAsmLayout *Layout) const { MCAssembler *Assembler = Layout ? 
&Layout->getAssembler() : nullptr; - return EvaluateAsRelocatableImpl(Res, Assembler, Layout, nullptr, false); + return EvaluateAsRelocatableImpl(Res, Assembler, Layout, nullptr, false, + /*ForceVarExpansion*/ false); } -bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, - const MCAssembler *Asm, +bool MCExpr::EvaluateAsValue(MCValue &Res, const MCAsmLayout *Layout) const { + MCAssembler *Assembler = Layout ? &Layout->getAssembler() : nullptr; + return EvaluateAsRelocatableImpl(Res, Assembler, Layout, nullptr, false, + /*ForceVarExpansion*/ true); +} + +bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, const MCAsmLayout *Layout, - const SectionAddrMap *Addrs, - bool InSet) const { + const SectionAddrMap *Addrs, bool InSet, + bool ForceVarExpansion) const { ++stats::MCExprEvaluate; switch (getKind()) { @@ -649,13 +656,14 @@ case SymbolRef: { const MCSymbolRefExpr *SRE = cast(this); + bool IsWeakRef = SRE->getKind() == MCSymbolRefExpr::VK_WEAKREF; const MCSymbol &Sym = SRE->getSymbol(); const MCAsmInfo &MCAsmInfo = SRE->getMCAsmInfo(); // Evaluate recursively if this is a variable. 
- if (Sym.isVariable()) { - if (Sym.getVariableValue()->EvaluateAsRelocatableImpl(Res, Asm, Layout, - Addrs, true)) { + if (Sym.isVariable() && !IsWeakRef) { + if (Sym.getVariableValue()->EvaluateAsRelocatableImpl( + Res, Asm, Layout, Addrs, true, ForceVarExpansion)) { const MCSymbolRefExpr *A = Res.getSymA(); const MCSymbolRefExpr *B = Res.getSymB(); @@ -669,9 +677,10 @@ if (!A && !B) return true; } else { + if (ForceVarExpansion) + return true; bool IsSymbol = A && A->getSymbol().isDefined(); - bool IsWeakRef = SRE->getKind() == MCSymbolRefExpr::VK_WEAKREF; - if (!IsSymbol && !IsWeakRef) + if (!IsSymbol) return true; } } @@ -685,8 +694,8 @@ const MCUnaryExpr *AUE = cast(this); MCValue Value; - if (!AUE->getSubExpr()->EvaluateAsRelocatableImpl(Value, Asm, Layout, - Addrs, InSet)) + if (!AUE->getSubExpr()->EvaluateAsRelocatableImpl(Value, Asm, Layout, Addrs, + InSet, ForceVarExpansion)) return false; switch (AUE->getOpcode()) { @@ -719,10 +728,10 @@ const MCBinaryExpr *ABE = cast(this); MCValue LHSValue, RHSValue; - if (!ABE->getLHS()->EvaluateAsRelocatableImpl(LHSValue, Asm, Layout, - Addrs, InSet) || - !ABE->getRHS()->EvaluateAsRelocatableImpl(RHSValue, Asm, Layout, - Addrs, InSet)) + if (!ABE->getLHS()->EvaluateAsRelocatableImpl(LHSValue, Asm, Layout, Addrs, + InSet, ForceVarExpansion) || + !ABE->getRHS()->EvaluateAsRelocatableImpl(RHSValue, Asm, Layout, Addrs, + InSet, ForceVarExpansion)) return false; // We only support a few operations on non-constant expressions, handle Index: lib/Target/ARM64/ARM64ISelLowering.cpp =================================================================== --- lib/Target/ARM64/ARM64ISelLowering.cpp +++ lib/Target/ARM64/ARM64ISelLowering.cpp @@ -3981,39 +3981,34 @@ // vector sources of the shuffle are different. static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, unsigned &Imm) { - unsigned NumElts = VT.getVectorNumElements(); - ReverseEXT = false; - - // Look for the first non-undef choice and count backwards from - // that. 
E.g. <-1, -1, 3, ...> means that an EXT must start at 3 - 2 = 1. This - // guarantees that at least one index is correct. - const int *FirstRealElt = - std::find_if(M.begin(), M.end(), [](int Elt) { return Elt >= 0; }); - assert(FirstRealElt != M.end() && "Completely UNDEF shuffle? Why bother?"); - Imm = *FirstRealElt - (FirstRealElt - M.begin()); + // Look for the first non-undef element. + const int *FirstRealElt = std::find_if(M.begin(), M.end(), + [](int Elt) {return Elt >= 0;}); - // If this is a VEXT shuffle, the immediate value is the index of the first - // element. The other shuffle indices must be the successive elements after - // the first one. - unsigned ExpectedElt = Imm; - for (unsigned i = 1; i < NumElts; ++i) { - // Increment the expected index. If it wraps around, it may still be - // a VEXT but the source vectors must be swapped. - ExpectedElt += 1; - if (ExpectedElt == NumElts * 2) { - ExpectedElt = 0; - ReverseEXT = true; - } + // Benefit form APInt to handle overflow when calculating expected element. + unsigned NumElts = VT.getVectorNumElements(); + unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + // The following shuffle indices must be the successive elements after the + // first real element. + const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), + [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); + if (FirstWrongElt != M.end()) + return false; - if (M[i] < 0) - continue; // ignore UNDEF indices - if (ExpectedElt != static_cast(M[i])) - return false; - } + // The index of an EXT is the first element if it is not UNDEF. + // Watch out for the beginning UNDEFs. The EXT index should be the expected + // value of the first element. + // E.g. <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. + // <-1, -1, 0, 1, ...> is treated as . IDX is + // equal to the ExpectedElt. For this case, ExpectedElt is (NumElts*2 - 2). + Imm = (M[0] >= 0) ? 
static_cast(M[0]) : ExpectedElt.getZExtValue(); // Adjust the index value if the source operands will be swapped. - if (ReverseEXT) + if (Imm >= NumElts) { + ReverseEXT = true; Imm -= NumElts; + } return true; } Index: lib/Target/PowerPC/PPCFrameLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCFrameLowering.cpp +++ lib/Target/PowerPC/PPCFrameLowering.cpp @@ -222,7 +222,7 @@ if (!DisableRedZone && (Subtarget.isPPC64() || // 32-bit SVR4, no stack- !Subtarget.isSVR4ABI() || // allocated locals. - FrameSize == 0) && + FrameSize == 0) && FrameSize <= 224 && // Fits in red zone. !MFI->hasVarSizedObjects() && // No dynamic alloca. !MFI->adjustsStack() && // No calls. @@ -281,8 +281,8 @@ // Naked functions have no stack frame pushed, so we don't have a frame // pointer. - if (MF.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::Naked)) + if (MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::Naked)) return false; return MF.getTarget().Options.DisableFramePointerElim(MF) || @@ -426,7 +426,8 @@ assert(FPIndex && "No Frame Pointer Save Slot!"); FPOffset = FFI->getObjectOffset(FPIndex); } else { - FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); + FPOffset = + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); } } @@ -562,13 +563,14 @@ assert(NegFrameSize); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); if (HasFP) { unsigned Reg = MRI->getDwarfRegNum(FPReg, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, Reg, FPOffset)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) 
.addCFIIndex(CFIIndex); } @@ -576,7 +578,7 @@ unsigned Reg = MRI->getDwarfRegNum(BPReg, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, Reg, BPOffset)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -584,7 +586,7 @@ unsigned Reg = MRI->getDwarfRegNum(LRReg, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, Reg, LROffset)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } } @@ -601,7 +603,7 @@ unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } } @@ -629,7 +631,7 @@ if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) { unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( nullptr, MRI->getDwarfRegNum(PPC::CR2, true), 8)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); continue; } @@ -637,7 +639,7 @@ int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( nullptr, MRI->getDwarfRegNum(Reg, true), Offset)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } } @@ -712,7 +714,8 @@ assert(FPIndex && "No Frame Pointer Save Slot!"); FPOffset = FFI->getObjectOffset(FPIndex); } else { - FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); + FPOffset = + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); } } @@ -930,9 +933,9 @@ MFI->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true); } - // For 32-bit SVR4, 
allocate the nonvolatile CR spill slot iff the + // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the // function uses CR 2, 3, or 4. - if (!isPPC64 && !isDarwinABI && + if (!isPPC64 && !isDarwinABI && (MRI.isPhysRegUsed(PPC::CR2) || MRI.isPhysRegUsed(PPC::CR3) || MRI.isPhysRegUsed(PPC::CR4))) { @@ -1106,10 +1109,10 @@ unsigned Reg = CSI[i].getReg(); if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2) - // Leave Darwin logic as-is. - || (!Subtarget.isSVR4ABI() && - (PPC::CRBITRCRegClass.contains(Reg) || - PPC::CRRCRegClass.contains(Reg)))) { + // Leave Darwin logic as-is. + || (!Subtarget.isSVR4ABI() && + (PPC::CRBITRCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg)))) { int FI = CSI[i].getFrameIdx(); FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); @@ -1190,11 +1193,11 @@ } } -bool +bool PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { // Currently, this function only handles SVR4 32- and 64-bit ABIs. // Return false otherwise to maintain pre-existing behavior. @@ -1207,7 +1210,7 @@ DebugLoc DL; bool CRSpilled = false; MachineInstrBuilder CRMIB; - + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); // Only Darwin actually uses the VRSAVE register, but it can still appear @@ -1237,21 +1240,21 @@ CRSpilled = true; FuncInfo->setSpillsCR(); - // 32-bit: FP-relative. Note that we made sure CR2-CR4 all have - // the same frame index in PPCRegisterInfo::hasReservedSpillSlot. - CRMIB = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12) + // 32-bit: FP-relative. Note that we made sure CR2-CR4 all have + // the same frame index in PPCRegisterInfo::hasReservedSpillSlot. 
+ CRMIB = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12) .addReg(Reg, RegState::ImplicitKill); - MBB.insert(MI, CRMIB); - MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW)) - .addReg(PPC::R12, - getKillRegState(true)), - CSI[i].getFrameIdx())); + MBB.insert(MI, CRMIB); + MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW)) + .addReg(PPC::R12, + getKillRegState(true)), + CSI[i].getFrameIdx())); } } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(MBB, MI, Reg, true, - CSI[i].getFrameIdx(), RC, TRI); + CSI[i].getFrameIdx(), RC, TRI); } } return true; @@ -1260,8 +1263,8 @@ static void restoreCRs(bool isPPC64, bool is31, bool CR2Spilled, bool CR3Spilled, bool CR4Spilled, - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector &CSI, unsigned CSIIndex) { + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, unsigned CSIIndex) { MachineFunction *MF = MBB.getParent(); const PPCInstrInfo &TII = @@ -1275,12 +1278,12 @@ else { // 32-bit: FP-relative MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ), - PPC::R12), - CSI[CSIIndex].getFrameIdx())); + PPC::R12), + CSI[CSIIndex].getFrameIdx())); RestoreOp = PPC::MTOCRF; MoveReg = PPC::R12; } - + if (CR2Spilled) MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2) .addReg(MoveReg, getKillRegState(!CR3Spilled && !CR4Spilled))); @@ -1335,11 +1338,11 @@ MBB.erase(I); } -bool +bool PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { // Currently, this function only handles SVR4 32- and 64-bit ABIs. // Return false otherwise to maintain pre-existing behavior. 
@@ -1387,20 +1390,20 @@ // When we first encounter a non-CR register after seeing at // least one CR register, restore all spilled CRs together. if ((CR2Spilled || CR3Spilled || CR4Spilled) - && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) { + && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) { bool is31 = needsFP(*MF); restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled, - MBB, I, CSI, CSIIndex); - CR2Spilled = CR3Spilled = CR4Spilled = false; + MBB, I, CSI, CSIIndex); + CR2Spilled = CR3Spilled = CR4Spilled = false; } // Default behavior for non-CR saves. const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), - RC, TRI); + RC, TRI); assert(I != MBB.begin() && - "loadRegFromStackSlot didn't insert any code!"); + "loadRegFromStackSlot didn't insert any code!"); } // Insert in reverse order. @@ -1409,16 +1412,15 @@ else { I = BeforeI; ++I; - } + } } // If we haven't yet spilled the CRs, do so now. if (CR2Spilled || CR3Spilled || CR4Spilled) { - bool is31 = needsFP(*MF); + bool is31 = needsFP(*MF); restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled, - MBB, I, CSI, CSIIndex); + MBB, I, CSI, CSIIndex); } return true; } - Index: lib/Target/Sparc/SparcFrameLowering.cpp =================================================================== --- lib/Target/Sparc/SparcFrameLowering.cpp +++ lib/Target/Sparc/SparcFrameLowering.cpp @@ -109,18 +109,21 @@ // Emit ".cfi_def_cfa_register 30". unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP)); - BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); // Emit ".cfi_window_save". 
CFIIndex = MMI.addFrameInst(MCCFIInstruction::createWindowSave(nullptr)); - BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true); unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true); // Emit ".cfi_register 15, 31". CFIIndex = MMI.addFrameInst( MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA)); - BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } void SparcFrameLowering:: Index: lib/Target/X86/X86FrameLowering.cpp =================================================================== --- lib/Target/X86/X86FrameLowering.cpp +++ lib/Target/X86/X86FrameLowering.cpp @@ -225,7 +225,8 @@ } } -/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower iterator. +/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower +/// iterator. static void mergeSPUpdatesDown(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, @@ -257,13 +258,12 @@ } /// mergeSPUpdates - Checks the instruction before/after the passed -/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and the -/// stack adjustment is returned as a positive value for ADD/LEA and a negative for -/// SUB. +/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and +/// the stack adjustment is returned as a positive value for ADD/LEA and a +/// negative for SUB. 
static int mergeSPUpdates(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, - bool doMergeWithPrevious) { + MachineBasicBlock::iterator &MBBI, unsigned StackPtr, + bool doMergeWithPrevious) { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) return 0; @@ -369,7 +369,8 @@ unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } } @@ -514,7 +515,7 @@ assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); // Change the rule for the FramePtr to be an "offset" rule. @@ -522,7 +523,7 @@ CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfFramePtr, 2 * stackGrowth)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -538,7 +539,7 @@ unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -567,7 +568,7 @@ assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); StackOffset += stackGrowth; } @@ -704,7 +705,7 @@ MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize + stackGrowth)); - BuildMI(MBB, 
MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -909,7 +910,8 @@ } } -int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { +int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { const X86RegisterInfo *RegInfo = static_cast(MF.getTarget().getRegisterInfo()); const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1260,22 +1262,23 @@ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else if (STI.isTargetDarwin()) { - // TlsOffset doesn't fit into a mod r/m byte so we need an extra register + // TlsOffset doesn't fit into a mod r/m byte so we need an extra register. unsigned ScratchReg2; bool SaveScratch2; if (CompareStackPointer) { - // The primary scratch register is available for holding the TLS offset + // The primary scratch register is available for holding the TLS offset. ScratchReg2 = GetScratchRegister(Is64Bit, MF, true); SaveScratch2 = false; } else { // Need to use a second register to hold the TLS offset ScratchReg2 = GetScratchRegister(Is64Bit, MF, false); - // Unfortunately, with fastcc the second scratch register may hold an arg + // Unfortunately, with fastcc the second scratch register may hold an + // argument. SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); } - // If Scratch2 is live-in then it needs to be saved + // If Scratch2 is live-in then it needs to be saved. assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && "Scratch register is live-in and not saved"); @@ -1352,14 +1355,14 @@ /// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf) /// /// CheckStack: -/// temp0 = sp - MaxStack -/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart /// OldStart: -/// ... +/// ... 
/// IncStack: -/// call inc_stack # doubles the stack space -/// temp0 = sp - MaxStack -/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +/// call inc_stack # doubles the stack space +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { const X86InstrInfo &TII = *TM.getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -5397,6 +5397,74 @@ return V; } +/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. +static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems, + unsigned NonZeros, unsigned NumNonZero, + unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget *Subtarget, + const TargetLowering &TLI) { + // We know there's at least one non-zero element + unsigned FirstNonZeroIdx = 0; + SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx); + while (FirstNonZero.getOpcode() == ISD::UNDEF || + X86::isZeroNode(FirstNonZero)) { + ++FirstNonZeroIdx; + FirstNonZero = Op->getOperand(FirstNonZeroIdx); + } + + if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa(FirstNonZero.getOperand(1))) + return SDValue(); + + SDValue V = FirstNonZero.getOperand(0); + unsigned FirstNonZeroDst = cast(FirstNonZero.getOperand(1))->getZExtValue(); + unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx; + unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx; + unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst; + + for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) { + SDValue Elem = Op.getOperand(Idx); + if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem)) + continue; + + // TODO: What else can be here? Deal with it. 
+ if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // TODO: Some optimizations are still possible here + // ex: Getting one element from a vector, and the rest from another. + if (Elem.getOperand(0) != V) + return SDValue(); + + unsigned Dst = cast(Elem.getOperand(1))->getZExtValue(); + if (Dst == Idx) + ++CorrectIdx; + else if (IncorrectIdx == -1U) { + IncorrectIdx = Idx; + IncorrectDst = Dst; + } else + // There was already one element with an incorrect index. + // We can't optimize this case to an insertps. + return SDValue(); + } + + if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) { + SDLoc dl(Op); + EVT VT = Op.getSimpleValueType(); + unsigned ElementMoveMask = 0; + if (IncorrectIdx == -1U) + ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4; + else + ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4; + + SDValue InsertpsMask = DAG.getIntPtrConstant( + ElementMoveMask | (~NonZeros & 0xf)); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask); + } + + return SDValue(); +} + /// getVShift - Return a vector logical shift node. /// static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, @@ -6148,6 +6216,14 @@ if (V.getNode()) return V; } + // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS + if (EVTBits == 32 && NumElems == 4) { + SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero, + NumZero, DAG, Subtarget, *this); + if (V.getNode()) + return V; + } + // If element VT is == 32 bits, turn it into a number of shuffles. 
SmallVector V(NumElems); if (NumElems == 4 && NumZero > 0) { Index: lib/Target/XCore/XCoreFrameLowering.cpp =================================================================== --- lib/Target/XCore/XCoreFrameLowering.cpp +++ lib/Target/XCore/XCoreFrameLowering.cpp @@ -64,7 +64,8 @@ MachineModuleInfo *MMI, unsigned DRegNum) { unsigned CFIIndex = MMI->addFrameInst( MCCFIInstruction::createDefCfaRegister(nullptr, DRegNum)); - BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } static void EmitDefCfaOffset(MachineBasicBlock &MBB, @@ -73,7 +74,8 @@ MachineModuleInfo *MMI, int Offset) { unsigned CFIIndex = MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -Offset)); - BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } static void EmitCfiOffset(MachineBasicBlock &MBB, @@ -82,7 +84,8 @@ unsigned DRegNum, int Offset) { unsigned CFIIndex = MMI->addFrameInst( MCCFIInstruction::createOffset(nullptr, DRegNum, Offset)); - BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } /// The SP register is moved in steps of 'MaxImmU16' towards the bottom of the @@ -113,7 +116,8 @@ /// IfNeededLDAWSP emits the necessary LDAWSP instructions to move the SP only /// as far as to make 'OffsetFromTop' reachable using an LDAWSP_lru6. /// \param OffsetFromTop the spill offset from the top of the frame. -/// \param [in,out] RemainingAdj the current SP offset from the top of the frame. +/// \param [in,out] RemainingAdj the current SP offset from the top of the +/// frame. 
static void IfNeededLDAWSP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc dl, const TargetInstrInfo &TII, int OffsetFromTop, @@ -346,7 +350,8 @@ RemainingAdj /= 4; if (RetOpcode == XCore::EH_RETURN) { - // 'Restore' the exception info the unwinder has placed into the stack slots. + // 'Restore' the exception info the unwinder has placed into the stack + // slots. SmallVector SpillList; GetEHSpillList(SpillList, MFI, XFI, MF.getTarget().getTargetLowering()); RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList); @@ -514,7 +519,7 @@ MBB.insert(I, New); } } - + MBB.erase(I); } Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp =================================================================== --- lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -256,7 +256,7 @@ void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); - size_t getNumRegs() const; + unsigned getNumRegs() const; Type *getType() const; void DeleteBaseReg(const SCEV *&S); @@ -351,7 +351,7 @@ /// getNumRegs - Return the total number of register operands used by this /// formula. This does not include register uses implied by non-constant /// addrec strides. -size_t Formula::getNumRegs() const { +unsigned Formula::getNumRegs() const { return !!ScaledReg + BaseRegs.size(); } @@ -4132,22 +4132,19 @@ E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; - // Ignore formulae which may not be ideal in terms of register reuse of - // ReqRegs. The formula should use all required registers before - // introducing new ones. - int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size()); + // Ignore formulae which do not use any of the required registers. 
+ bool SatisfiedReqReg = true; for (SmallSetVector::const_iterator J = ReqRegs.begin(), JE = ReqRegs.end(); J != JE; ++J) { const SCEV *Reg = *J; - if ((F.ScaledReg && F.ScaledReg == Reg) || - std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) != + if ((!F.ScaledReg || F.ScaledReg != Reg) && + std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) == F.BaseRegs.end()) { - --NumReqRegsToFind; - if (NumReqRegsToFind == 0) - break; + SatisfiedReqReg = false; + break; } } - if (NumReqRegsToFind != 0) { + if (!SatisfiedReqReg) { // If none of the formulae satisfied the required registers, then we could // clear ReqRegs and try again. Currently, we simply give up in this case. continue; Index: test/Analysis/BlockFrequencyInfo/irreducible.ll =================================================================== --- test/Analysis/BlockFrequencyInfo/irreducible.ll +++ test/Analysis/BlockFrequencyInfo/irreducible.ll @@ -34,16 +34,28 @@ !0 = metadata !{metadata !"branch_weights", i32 1, i32 7} !1 = metadata !{metadata !"branch_weights", i32 3, i32 4} -; The current BlockFrequencyInfo algorithm doesn't handle multiple entrances -; into a loop very well. The frequencies assigned to blocks in the loop are -; predictable (and not absurd), but also not correct and therefore not worth -; testing. +; Irreducible control flow +; ======================== ; -; There are two testcases below. +; LoopInfo defines a loop as a non-trivial SCC dominated by a single block, +; called the header. A given loop, L, can have sub-loops, which are loops +; within the subgraph of L that excludes the header. ; -; For each testcase, I use a CHECK-NEXT/NOT combo like an XFAIL with the -; granularity of a single check. If/when this behaviour is fixed, we'll know -; about it, and the test should be updated. +; In addition to loops, -block-freq has limited support for irreducible SCCs, +; which are SCCs with multiple entry blocks. 
Irreducible SCCs are discovered +; on the fly, and modelled as loops with multiple headers. +; +; The headers of irreducible sub-SCCs consist of its entry blocks and all nodes +; that are targets of a backedge within it (excluding backedges within true +; sub-loops). +; +; -block-freq is currently designed to act like a block is inserted that +; intercepts all the edges to the headers. All backedges and entries point to +; this block. Its successors are the headers, which split the frequency +; evenly. +; +; There are a number of testcases below. Only the first two have detailed +; explanations. ; ; Testcase #1 ; =========== @@ -77,36 +89,31 @@ ; loop as a whole is 1/4, so the loop scale should be 4. Summing c1 and c2 ; gives 28/7, or 4.0, which is nice confirmation of the math above. ; -; However, assuming c1 precedes c2 in reverse post-order, the current algorithm -; returns 3/4 and 13/16, respectively. LoopInfo ignores edges between loops -; (and doesn't see any loops here at all), and -block-freq ignores the -; irreducible edge from c2 to c1. -; +; -block-freq currently treats the two nodes as equals. +define void @multientry(i1 %x) { ; CHECK-LABEL: Printing analysis {{.*}} for function 'multientry': ; CHECK-NEXT: block-frequency-info: multientry -define void @multientry(i1 %x) { -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] br i1 %x, label %c1, label %c2, !prof !2 -; This is like a single-line XFAIL (see above). -; CHECK-NEXT: c1: -; CHECK-NOT: float = 2.142857{{[0-9]*}}, c1: +; CHECK-NEXT: c1: float = 2.0, +; The "correct" answer is: float = 2.142857{{[0-9]*}}, br i1 %x, label %c2, label %exit, !prof !2 -; This is like a single-line XFAIL (see above). 
-; CHECK-NEXT: c2: -; CHECK-NOT: float = 1.857142{{[0-9]*}}, c2: +; CHECK-NEXT: c2: float = 2.0, +; The "correct" answer is: float = 1.857142{{[0-9]*}}, br i1 %x, label %c1, label %exit, !prof !2 -; We still shouldn't lose any frequency. -; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] ret void } +!2 = metadata !{metadata !"branch_weights", i32 3, i32 1} + ; Testcase #2 ; =========== ; @@ -124,73 +131,291 @@ ; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This ; infinite series sums to 1. ; -; However, assuming c1 precedes c2 in reverse post-order, the current algorithm -; returns 1/2 and 3/4, respectively. LoopInfo ignores edges between loops (and -; treats c1 and c2 as self-loops only), and -block-freq ignores the irreducible -; edge from c2 to c1. -; -; Below I use a CHECK-NEXT/NOT combo like an XFAIL with the granularity of a -; single check. If/when this behaviour is fixed, we'll know about it, and the -; test should be updated. -; +; Since the currently algorithm *always* assumes entry blocks are equal, +; -block-freq gets the right answers here. +define void @crossloops(i2 %x) { ; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops': ; CHECK-NEXT: block-frequency-info: crossloops -define void @crossloops(i2 %x) { -; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] switch i2 %x, label %exit [ i2 1, label %c1 i2 2, label %c2 ], !prof !3 -; This is like a single-line XFAIL (see above). -; CHECK-NEXT: c1: -; CHECK-NOT: float = 1.0, c1: +; CHECK-NEXT: c1: float = 1.0, switch i2 %x, label %exit [ i2 1, label %c1 i2 2, label %c2 ], !prof !3 -; This is like a single-line XFAIL (see above). -; CHECK-NEXT: c2: -; CHECK-NOT: float = 1.0, c2: +; CHECK-NEXT: c2: float = 1.0, switch i2 %x, label %exit [ i2 1, label %c1 i2 2, label %c2 ], !prof !3 -; We still shouldn't lose any frequency. 
-; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] ret void } -!2 = metadata !{metadata !"branch_weights", i32 3, i32 1} !3 = metadata !{metadata !"branch_weights", i32 2, i32 2, i32 2} -; A reducible loop with irreducible control flow inside should still have -; correct exit frequency. -; +; A true loop with irreducible control flow inside. +define void @loop_around_irreducible(i1 %x) { ; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_around_irreducible': ; CHECK-NEXT: block-frequency-info: loop_around_irreducible -define void @loop_around_irreducible(i1 %x) { +entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br label %loop + +loop: +; CHECK-NEXT: loop: float = 4.0, int = [[HEAD:[0-9]+]] + br i1 %x, label %left, label %right, !prof !4 + +left: +; CHECK-NEXT: left: float = 8.0, + br i1 %x, label %right, label %loop.end, !prof !5 + +right: +; CHECK-NEXT: right: float = 8.0, + br i1 %x, label %left, label %loop.end, !prof !5 + +loop.end: +; CHECK-NEXT: loop.end: float = 4.0, int = [[HEAD]] + br i1 %x, label %loop, label %exit, !prof !5 + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!4 = metadata !{metadata !"branch_weights", i32 1, i32 1} +!5 = metadata !{metadata !"branch_weights", i32 3, i32 1} + +; Two unrelated irreducible SCCs. 
+define void @two_sccs(i1 %x) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'two_sccs': +; CHECK-NEXT: block-frequency-info: two_sccs +entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br i1 %x, label %a, label %b, !prof !6 + +a: +; CHECK-NEXT: a: float = 0.75, + br i1 %x, label %a.left, label %a.right, !prof !7 + +a.left: +; CHECK-NEXT: a.left: float = 1.5, + br i1 %x, label %a.right, label %exit, !prof !6 + +a.right: +; CHECK-NEXT: a.right: float = 1.5, + br i1 %x, label %a.left, label %exit, !prof !6 + +b: +; CHECK-NEXT: b: float = 0.25, + br i1 %x, label %b.left, label %b.right, !prof !7 + +b.left: +; CHECK-NEXT: b.left: float = 0.625, + br i1 %x, label %b.right, label %exit, !prof !8 + +b.right: +; CHECK-NEXT: b.right: float = 0.625, + br i1 %x, label %b.left, label %exit, !prof !8 + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!6 = metadata !{metadata !"branch_weights", i32 3, i32 1} +!7 = metadata !{metadata !"branch_weights", i32 1, i32 1} +!8 = metadata !{metadata !"branch_weights", i32 4, i32 1} + +; A true loop inside irreducible control flow. 
+define void @loop_inside_irreducible(i1 %x) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_inside_irreducible': +; CHECK-NEXT: block-frequency-info: loop_inside_irreducible +entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br i1 %x, label %left, label %right, !prof !9 + +left: +; CHECK-NEXT: left: float = 2.0, + br i1 %x, label %right, label %exit, !prof !10 + +right: +; CHECK-NEXT: right: float = 2.0, int = [[RIGHT:[0-9]+]] + br label %loop + +loop: +; CHECK-NEXT: loop: float = 6.0, + br i1 %x, label %loop, label %right.end, !prof !11 + +right.end: +; CHECK-NEXT: right.end: float = 2.0, int = [[RIGHT]] + br i1 %x, label %left, label %exit, !prof !10 + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!9 = metadata !{metadata !"branch_weights", i32 1, i32 1} +!10 = metadata !{metadata !"branch_weights", i32 3, i32 1} +!11 = metadata !{metadata !"branch_weights", i32 2, i32 1} + +; Irreducible control flow in a branch that's in a true loop. 
+define void @loop_around_branch_with_irreducible(i1 %x) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_around_branch_with_irreducible': +; CHECK-NEXT: block-frequency-info: loop_around_branch_with_irreducible +entry: ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br label %loop + +loop: +; CHECK-NEXT: loop: float = 2.0, int = [[LOOP:[0-9]+]] + br i1 %x, label %normal, label %irreducible.entry, !prof !12 + +normal: +; CHECK-NEXT: normal: float = 1.5, + br label %loop.end + +irreducible.entry: +; CHECK-NEXT: irreducible.entry: float = 0.5, int = [[IRREDUCIBLE:[0-9]+]] + br i1 %x, label %left, label %right, !prof !13 + +left: +; CHECK-NEXT: left: float = 1.0, + br i1 %x, label %right, label %irreducible.exit, !prof !12 + +right: +; CHECK-NEXT: right: float = 1.0, + br i1 %x, label %left, label %irreducible.exit, !prof !12 + +irreducible.exit: +; CHECK-NEXT: irreducible.exit: float = 0.5, int = [[IRREDUCIBLE]] + br label %loop.end + +loop.end: +; CHECK-NEXT: loop.end: float = 2.0, int = [[LOOP]] + br i1 %x, label %loop, label %exit, !prof !13 + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!12 = metadata !{metadata !"branch_weights", i32 3, i32 1} +!13 = metadata !{metadata !"branch_weights", i32 1, i32 1} + +; Irreducible control flow between two true loops. 
+define void @loop_around_branch_with_irreducible_around_loop(i1 %x) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_around_branch_with_irreducible_around_loop': +; CHECK-NEXT: block-frequency-info: loop_around_branch_with_irreducible_around_loop entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] br label %loop -; CHECK-NEXT: loop: float = [[HEAD:[0-9.]+]], int = [[HEADINT:[0-9]+]] loop: - br i1 %x, label %left, label %right +; CHECK-NEXT: loop: float = 3.0, int = [[LOOP:[0-9]+]] + br i1 %x, label %normal, label %irreducible, !prof !14 + +normal: +; CHECK-NEXT: normal: float = 2.0, + br label %loop.end + +irreducible: +; CHECK-NEXT: irreducible: float = 1.0, + br i1 %x, label %left, label %right, !prof !15 -; CHECK-NEXT: left: left: - br i1 %x, label %right, label %loop.end +; CHECK-NEXT: left: float = 2.0, + br i1 %x, label %right, label %loop.end, !prof !16 -; CHECK-NEXT: right: right: - br i1 %x, label %left, label %loop.end +; CHECK-NEXT: right: float = 2.0, int = [[RIGHT:[0-9]+]] + br label %right.loop + +right.loop: +; CHECK-NEXT: right.loop: float = 10.0, + br i1 %x, label %right.loop, label %right.end, !prof !17 + +right.end: +; CHECK-NEXT: right.end: float = 2.0, int = [[RIGHT]] + br i1 %x, label %left, label %loop.end, !prof !16 -; CHECK-NEXT: loop.end: float = [[HEAD]], int = [[HEADINT]] loop.end: - br i1 %x, label %loop, label %exit +; CHECK-NEXT: loop.end: float = 3.0, int = [[LOOP]] + br i1 %x, label %loop, label %exit, !prof !14 + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!14 = metadata !{metadata !"branch_weights", i32 2, i32 1} +!15 = metadata !{metadata !"branch_weights", i32 1, i32 1} +!16 = metadata !{metadata !"branch_weights", i32 3, i32 1} +!17 = metadata !{metadata !"branch_weights", i32 4, i32 1} + +; An irreducible SCC with a non-header. 
+define void @nonheader(i1 %x) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'nonheader': +; CHECK-NEXT: block-frequency-info: nonheader +entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br i1 %x, label %left, label %right, !prof !18 + +left: +; CHECK-NEXT: left: float = 1.0, + br i1 %x, label %bottom, label %exit, !prof !19 + +right: +; CHECK-NEXT: right: float = 1.0, + br i1 %x, label %bottom, label %exit, !prof !20 + +bottom: +; CHECK-NEXT: bottom: float = 1.0, + br i1 %x, label %left, label %right, !prof !18 -; CHECK-NEXT: float = 1.0, int = [[ENTRY]] exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!18 = metadata !{metadata !"branch_weights", i32 1, i32 1} +!19 = metadata !{metadata !"branch_weights", i32 1, i32 3} +!20 = metadata !{metadata !"branch_weights", i32 3, i32 1} + +; An irreducible SCC with an irreducible sub-SCC. In the current version of +; -block-freq, this means an extra header. +; +; This testcase uses non-trivial branch weights. The CHECK statements here +; will start to fail if we change -block-freq to be more accurate. Currently, +; we expect left, right and top to be treated as equal headers. 
+define void @nonentry_header(i1 %x, i2 %y) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'nonentry_header': +; CHECK-NEXT: block-frequency-info: nonentry_header +entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br i1 %x, label %left, label %right, !prof !21 + +left: +; CHECK-NEXT: left: float = 3.0, + br i1 %x, label %top, label %bottom, !prof !22 + +right: +; CHECK-NEXT: right: float = 3.0, + br i1 %x, label %top, label %bottom, !prof !22 + +top: +; CHECK-NEXT: top: float = 3.0, + switch i2 %y, label %exit [ i2 0, label %left + i2 1, label %right + i2 2, label %bottom ], !prof !23 + +bottom: +; CHECK-NEXT: bottom: float = 4.5, + br label %top + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] ret void } +!21 = metadata !{metadata !"branch_weights", i32 2, i32 1} +!22 = metadata !{metadata !"branch_weights", i32 1, i32 1} +!23 = metadata !{metadata !"branch_weights", i32 8, i32 1, i32 3, i32 12} Index: test/CodeGen/ARM64/ext.ll =================================================================== --- test/CodeGen/ARM64/ext.ll +++ test/CodeGen/ARM64/ext.ll @@ -65,6 +65,15 @@ ret <8 x i8> %tmp3 } +define <8 x i8> @test_vextd_undef2(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextd_undef2: +;CHECK: {{ext.8b.*#6}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: test_vextRq_undef: ;CHECK: {{ext.16b.*#7}} @@ -74,6 +83,14 @@ ret <16 x i8> %tmp3 } +define <8 x i16> @test_vextRq_undef2(<8 x i16>* %A) nounwind { +;CHECK-LABEL: test_vextRq_undef2: +;CHECK: {{ext.16b.*#10}} + %tmp1 = load <8 x i16>* %A + %vext = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> + ret <8 x i16> %vext; +} + ; Tests for ReconstructShuffle function. Indices have to be carefully ; chosen to reach lowering phase as a BUILD_VECTOR. 
Index: test/CodeGen/X86/sse41.ll =================================================================== --- test/CodeGen/X86/sse41.ll +++ test/CodeGen/X86/sse41.ll @@ -320,3 +320,259 @@ %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> ret <4 x i32> %result } + +;;;;;; Shuffles optimizable with a single insertps instruction +define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_XYZ0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $8 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecext3 = extractelement <4 x float> %x, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2 + %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 + ret <4 x float> %vecinit5 +} + +define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_XY00: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $12 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_XYY0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $104 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecinit4 = insertelement <4 x 
float> %vecinit2, float %vecext1, i32 2 + %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 + ret <4 x float> %vecinit5 +} + +define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_XYW0: +; CHECK: insertps $232 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecext2 = extractelement <4 x float> %x, i32 3 + %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_W00W: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $198 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 3 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_X00A: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2 + %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_X00X: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x 
float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2 + %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_X0YC: +; CHECK: shufps +; CHECK-NOT: movhlps +; CHECK-NOT: shufps +; CHECK: insertps $176 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> + %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> + ret <4 x float> %vecinit5 +} + +define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_XYZ0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $8 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecext3 = extractelement <4 x i32> %x, i32 2 + %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2 + %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3 + ret <4 x i32> %vecinit5 +} + +define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_XY00: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $12 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2 + %vecinit4 = insertelement <4 x 
i32> %vecinit3, i32 0, i32 3 + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_XYY0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $104 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2 + %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3 + ret <4 x i32> %vecinit5 +} + +define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_XYW0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $232 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecext2 = extractelement <4 x i32> %x, i32 3 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2 + %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3 + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_W00W: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $198 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 3 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2 + %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3 + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_X00A: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x 
i32> undef, i32 %vecext, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 + %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_X00X: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 + %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_X0YC: +; CHECK: shufps +; CHECK-NOT: movhlps +; CHECK-NOT: shufps +; CHECK: insertps $176 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> + %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> + ret <4 x i32> %vecinit5 +} + +;; Test for a bug in the first implementation of LowerBuildVectorv4x32 +define < 4 x float> @test_insertps_no_undef(<4 x float> %x) { +; CHECK-LABEL: test_insertps_no_undef: +; CHECK: movaps %xmm0, %xmm1 +; CHECK-NEXT: insertps $8, %xmm1, %xmm1 +; CHECK-NEXT: maxps %xmm1, %xmm0 +; CHECK-NEXT: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecext3 = extractelement <4 x float> %x, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2 + %vecinit5 = insertelement <4 
x float> %vecinit4, float 0.0, i32 3 + %mask = fcmp olt <4 x float> %vecinit5, %x + %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5 + ret <4 x float> %res +} Index: test/DebugInfo/X86/inline-member-function.ll =================================================================== --- /dev/null +++ test/DebugInfo/X86/inline-member-function.ll @@ -0,0 +1,90 @@ +; REQUIRES: object-emission + +; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s + +; From source: +; struct foo { +; int __attribute__((always_inline)) func(int x) { return x + 2; } +; }; + +; int i; + +; int main() { +; return foo().func(i); +; } + +; Ensure we omit DW_AT_object_pointer on inlined subroutines. +; CHECK: DW_TAG_inlined_subroutine +; CHECK-NEXT: DW_AT_abstract_origin {{.*}}{[[ABSTRACT_ORIGIN:0x[0-9a-e]*]]} +; CHECK-NOT: NULL +; CHECK-NOT: DW_AT_object_pointer +; CHECK: DW_TAG + +; But make sure we emit DW_AT_object_pointer on the abstract definition. 
+; CHECK: [[ABSTRACT_ORIGIN]]: DW_TAG_subprogram +; CHECK-NOT: NULL +; CHECK-NOT: TAG +; CHECK: DW_AT_object_pointer + +%struct.foo = type { i8 } + +@i = global i32 0, align 4 + +; Function Attrs: uwtable +define i32 @main() #0 { +entry: + %this.addr.i = alloca %struct.foo*, align 8 + %x.addr.i = alloca i32, align 4 + %retval = alloca i32, align 4 + %tmp = alloca %struct.foo, align 1 + store i32 0, i32* %retval + %0 = load i32* @i, align 4, !dbg !23 + store %struct.foo* %tmp, %struct.foo** %this.addr.i, align 8 + call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr.i}, metadata !24), !dbg !26 + store i32 %0, i32* %x.addr.i, align 4 + call void @llvm.dbg.declare(metadata !{i32* %x.addr.i}, metadata !27), !dbg !28 + %this1.i = load %struct.foo** %this.addr.i + %1 = load i32* %x.addr.i, align 4, !dbg !28 + %add.i = add nsw i32 %1, 2, !dbg !28 + ret i32 %add.i, !dbg !23 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata) #1 + +attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!20, !21} +!llvm.ident = !{!22} + +!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !12, metadata !18, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline.cpp] [DW_LANG_C_plus_plus] +!1 = metadata !{metadata !"inline.cpp", metadata !"/tmp/dbginfo"} +!2 = metadata !{} +!3 = metadata !{metadata !4} +!4 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] 
[from ] +!5 = metadata !{metadata !6} +!6 = metadata !{i32 786478, metadata !1, metadata !"_ZTS3foo", metadata !"func", metadata !"func", metadata !"_ZN3foo4funcEi", i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !11, i32 2} ; [ DW_TAG_subprogram ] [line 2] [func] +!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{metadata !9, metadata !10, metadata !9} +!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo] +!11 = metadata !{i32 786468} +!12 = metadata !{metadata !13, metadata !17} +!13 = metadata !{i32 786478, metadata !1, metadata !14, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main] +!14 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline.cpp] +!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!16 = metadata !{metadata !9} +!17 = metadata !{i32 786478, metadata !1, metadata !"_ZTS3foo", metadata !"func", metadata !"func", metadata !"_ZN3foo4funcEi", i32 2, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, metadata !6, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [func] +!18 = metadata 
!{metadata !19} +!19 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"", metadata !14, i32 5, metadata !9, i32 0, i32 1, i32* @i, null} ; [ DW_TAG_variable ] [i] [line 5] [def] +!20 = metadata !{i32 2, metadata !"Dwarf Version", i32 4} +!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!22 = metadata !{metadata !"clang version 3.5.0 "} +!23 = metadata !{i32 8, i32 0, metadata !13, null} ; [ DW_TAG_imported_declaration ] +!24 = metadata !{i32 786689, metadata !17, metadata !"this", null, i32 16777216, metadata !25, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0] +!25 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo] +!26 = metadata !{i32 0, i32 0, metadata !17, metadata !23} +!27 = metadata !{i32 786689, metadata !17, metadata !"x", metadata !14, i32 33554434, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 2] +!28 = metadata !{i32 2, i32 0, metadata !17, metadata !23} Index: test/MC/ELF/offset.s =================================================================== --- test/MC/ELF/offset.s +++ test/MC/ELF/offset.s @@ -71,3 +71,62 @@ // CHECK-NEXT: Other: 0 // CHECK-NEXT: Section: .data // CHECK-NEXT: } + + + .globl test2_a + .globl test2_b + .globl test2_c + .globl test2_d + .globl test2_e +test2_a: + .long 0 +test2_b = test2_a +test2_c: + .long 0 +test2_d = test2_c +test2_e = test2_d - test2_b +// CHECK: Symbol { +// CHECK: Name: test2_a +// CHECK-NEXT: Value: 0x5 +// CHECK-NEXT: Size: 0 +// CHECK-NEXT: Binding: Global +// CHECK-NEXT: Type: None +// CHECK-NEXT: Other: 0 +// CHECK-NEXT: Section: .data +// CHECK-NEXT: } +// CHECK-NEXT: Symbol { +// CHECK-NEXT: Name: test2_b +// CHECK-NEXT: Value: 0x5 +// CHECK-NEXT: Size: 0 +// CHECK-NEXT: Binding: Global +// CHECK-NEXT: Type: None +// CHECK-NEXT: Other: 0 +// CHECK-NEXT: Section: .data +// CHECK-NEXT: } +// 
CHECK-NEXT: Symbol { +// CHECK-NEXT: Name: test2_c +// CHECK-NEXT: Value: 0x9 +// CHECK-NEXT: Size: 0 +// CHECK-NEXT: Binding: Global +// CHECK-NEXT: Type: None +// CHECK-NEXT: Other: 0 +// CHECK-NEXT: Section: .data +// CHECK-NEXT: } +// CHECK-NEXT: Symbol { +// CHECK-NEXT: Name: test2_d +// CHECK-NEXT: Value: 0x9 +// CHECK-NEXT: Size: 0 +// CHECK-NEXT: Binding: Global +// CHECK-NEXT: Type: None +// CHECK-NEXT: Other: 0 +// CHECK-NEXT: Section: .data +// CHECK-NEXT: } +// CHECK-NEXT: Symbol { +// CHECK-NEXT: Name: test2_e +// CHECK-NEXT: Value: 0x4 +// CHECK-NEXT: Size: 0 +// CHECK-NEXT: Binding: Global +// CHECK-NEXT: Type: None +// CHECK-NEXT: Other: 0 +// CHECK-NEXT: Section: Absolute +// CHECK-NEXT: } Index: test/Transforms/LoopStrengthReduce/ARM64/lit.local.cfg =================================================================== --- test/Transforms/LoopStrengthReduce/ARM64/lit.local.cfg +++ test/Transforms/LoopStrengthReduce/ARM64/lit.local.cfg @@ -1,4 +1,4 @@ -config.suffixes = ['.ll' '.c'] +config.suffixes = ['.ll'] targets = set(config.root.targets_to_build.split()) if not 'ARM64' in targets: Index: test/Transforms/LoopStrengthReduce/ARM64/req-regs.c =================================================================== --- test/Transforms/LoopStrengthReduce/ARM64/req-regs.c +++ /dev/null @@ -1,36 +0,0 @@ -// RUN: clang %s -O3 -target arm64-apple-ios -o - -S -mllvm -debug-only=loop-reduce 2>&1| FileCheck %s -// REQUIRES: asserts - -// LSR used to fail here due to a bug in the ReqRegs test. To complicate -// things, this could only be reproduced with clang because the uses would -// come out in different order when invoked through llc. 
- -// CHECK: The chosen solution requires -// CHECK-NOT: No Satisfactory Solution - -typedef unsigned long iter_t; -void use_int(int result); - -struct _state { - int N; - int M; - int K; - double* data; -}; -void -do_integer_add(iter_t iterations, void* cookie) -{ - struct _state *pState = (struct _state*)cookie; - register int i; - register int a = pState->N + 57; - - while (iterations-- > 0) { - for (i = 1; i < 1001; ++i) { - a=a+a+i; a=a+a+i; a=a+a+i; a=a+a+i; - a=a+a+i; a=a+a+i; a=a+a+i; a=a+a+i; - a=a+a+i; a=a+a+i; - - } - } - use_int(a); -} Index: unittests/ADT/PointerUnionTest.cpp =================================================================== --- unittests/ADT/PointerUnionTest.cpp +++ unittests/ADT/PointerUnionTest.cpp @@ -13,22 +13,24 @@ namespace { -typedef PointerUnion PU; +typedef PointerUnion PU; -// Test fixture -class PointerUnionTest : public testing::Test { -}; +struct PointerUnionTest : public testing::Test { + float f; + int i; -float f = 3.14f; -int i = 42; + PU a, b, c, n; -const PU a(&f); -const PU b(&i); -const PU n; + PointerUnionTest() : f(3.14f), i(42), a(&f), b(&i), c(&i), n() {} +}; TEST_F(PointerUnionTest, Comparison) { + EXPECT_TRUE(a == a); + EXPECT_FALSE(a != a); EXPECT_TRUE(a != b); EXPECT_FALSE(a == b); + EXPECT_TRUE(b == c); + EXPECT_FALSE(b != c); EXPECT_TRUE(b != n); EXPECT_FALSE(b == n); } @@ -44,21 +46,27 @@ EXPECT_TRUE((bool)a); EXPECT_TRUE((bool)b); EXPECT_FALSE(n); + + EXPECT_NE(n, b); + EXPECT_EQ(b, c); + b = nullptr; + EXPECT_EQ(n, b); + EXPECT_NE(b, c); } TEST_F(PointerUnionTest, Is) { - EXPECT_FALSE(a.is()); - EXPECT_TRUE(a.is()); - EXPECT_TRUE(b.is()); - EXPECT_FALSE(b.is()); - EXPECT_TRUE(n.is()); - EXPECT_FALSE(n.is()); + EXPECT_FALSE(a.is()); + EXPECT_TRUE(a.is()); + EXPECT_TRUE(b.is()); + EXPECT_FALSE(b.is()); + EXPECT_TRUE(n.is()); + EXPECT_FALSE(n.is()); } TEST_F(PointerUnionTest, Get) { - EXPECT_EQ(a.get(), &f); - EXPECT_EQ(b.get(), &i); - EXPECT_EQ(n.get(), (int*)0); + EXPECT_EQ(a.get(), &f); + 
EXPECT_EQ(b.get(), &i); + EXPECT_EQ(n.get(), (int *)0); } } // end anonymous namespace