Index: include/llvm/ADT/PostOrderIterator.h
===================================================================
--- include/llvm/ADT/PostOrderIterator.h
+++ include/llvm/ADT/PostOrderIterator.h
@@ -296,12 +296,15 @@
 
 public:
   using rpo_iterator = typename std::vector<NodeRef>::reverse_iterator;
+  using const_rpo_iterator = typename std::vector<NodeRef>::const_reverse_iterator;
 
   ReversePostOrderTraversal(GraphT G) { Initialize(GT::getEntryNode(G)); }
 
   // Because we want a reverse post order, use reverse iterators from the vector
   rpo_iterator begin() { return Blocks.rbegin(); }
+  const_rpo_iterator begin() const { return Blocks.crbegin(); }
   rpo_iterator end() { return Blocks.rend(); }
+  const_rpo_iterator end() const { return Blocks.crend(); }
 };
 
 } // end namespace llvm
Index: include/llvm/Analysis/DivergenceAnalysis.h
===================================================================
--- /dev/null
+++ include/llvm/Analysis/DivergenceAnalysis.h
@@ -0,0 +1,245 @@
+//===- llvm/Analysis/DivergenceAnalysis.h - Divergence Analysis -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// The divergence analysis determines which instructions and branches are
+// divergent given a set of divergent source instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
+#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/SyncDependenceAnalysis.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include <vector>
+
+namespace llvm {
+class Module;
+class Value;
+class Instruction;
+class Loop;
+class raw_ostream;
+class TargetTransformInfo;
+
+/// \brief Generic divergence analysis for reducible CFGs.
+///
+/// This analysis propagates divergence in a data-parallel context from sources
+/// of divergence to all users. It requires reducible CFGs. All assignments
+/// should be in SSA form.
+class DivergenceAnalysis {
+public:
+  /// \brief This instance will analyze the whole function \p or the loop \p
+  /// RegionLoop.
+  ///
+  /// \param RegionLoop If non-null the analysis is restricted to \p RegionLoop.
+  /// Otherwise the whole function is analyzed.
+  /// \param IsLCSSAFOrm whether the analysis may assume that the IR in the
+  /// region in in LCSSA form.
+  DivergenceAnalysis(const Function &F, const Loop *RegionLoop,
+                     const DominatorTree &DT, const LoopInfo &LI,
+                     SyncDependenceAnalysis &SDA, bool IsLCSSAForm);
+
+  /// \brief the loop that defines the analyzed region (if any).
+  const Loop *getRegionLoop() const { return RegionLoop; }
+  const Function &getFunction() const { return F; }
+
+  /// \brief Whether \p BB is part of the region.
+  bool inRegion(const BasicBlock &BB) const;
+  /// \brief Whether \p I is part of the region.
+  bool inRegion(const Instruction &I) const;
+
+  /// \brief Mark \p UniVal as a value that is always uniform.
+  void addUniformOverride(const Value &UniVal);
+
+  /// \brief Mark \p DivVal as a value that is always divergent.
+  void markDivergent(const Value &DivVal);
+
+  /// \brief Propagate divergence to all instructions in the region.
+  /// Divergence is seeded by calls to \p markDivergent.
+  void compute();
+
+  /// \brief Whether any value was marked or analyzed to be divergent.
+  bool hasDetectedDivergence() const { return !DivergentValues.empty(); }
+
+  /// \brief Whether \p Val will always return a uniform value regardless of its
+  /// operands
+  bool isAlwaysUniform(const Value &Val) const;
+
+  /// \brief Whether \p Val is a divergent value
+  bool isDivergent(const Value &Val) const;
+
+  void print(raw_ostream &OS, const Module *) const;
+
+private:
+  bool updateTerminator(const TerminatorInst &Term) const;
+  bool updatePHINode(const PHINode &Phi) const;
+
+  /// \returns whether \p Inst should be considered divergent based on the
+  /// divergence of its operands.
+  ///
+  /// This should only be called for non-phi, non-terminator instructions.
+  bool updateNormalInstruction(const Instruction &Inst) const;
+
+  /// \brief mark users of live-out users as divergent.
+  ///
+  /// \param LoopHeader the header of the divergent loop.
+  ///
+  /// marks all users of live-out values of the loop headed by @loopHeader
+  /// as divergent and puts them on the worklist.
+  void taintLoopLiveOuts(const BasicBlock &LoopHeader);
+
+  /// \brief Push all users of \p Val (in the region) to the worklist
+  void pushUsers(const Value &I);
+
+  /// \brief Push all phi nodes in @block to the worklist
+  void pushPHINodes(const BasicBlock &Block);
+
+  /// \brief Mark \p Block as join divergent
+  ///
+  /// A block is join divergent if two threads may reach it from different
+  /// incoming blocks at the same time.
+  void markBlockJoinDivergent(const BasicBlock &Block) {
+    DivergentJoinBlocks.insert(&Block);
+  }
+
+  /// \brief Whether \p Val is divergent when read in \p ObservingBlock.
+  bool isTemporalDivergent(const BasicBlock &ObservingBlock,
+                           const Value &Val) const;
+
+  /// \brief Whether \p Block is join divergent
+  ///
+  /// (see markBlockJoinDivergent).
+  bool isJoinDivergent(const BasicBlock &Block) const {
+    return DivergentJoinBlocks.count(&Block);
+  }
+
+  /// \brief Propagate control-induced divergence to users (phi nodes and
+  /// instructions).
+  //
+  // \param JoinBlock is a divergent loop exit or join point of two disjoint
+  // paths.
+  // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop.
+  bool propagateJoinDivergence(const BasicBlock &JoinBlock,
+                               const Loop *TermLoop);
+
+  /// \brief Propagate induced value divergence due to control divergence in \p
+  /// Term.
+  void propagateBranchDivergence(const TerminatorInst &Term);
+
+  /// \brief Propagate divergent caused by a divergent loop exit.
+  ///
+  /// \param ExitingLoop is a divergent loop.
+  void propagateLoopDivergence(const Loop &ExitingLoop);
+
+private:
+  const Function &F;
+  // If regionLoop != nullptr, analysis is only performed within \p RegionLoop.
+  // Otw, analyze the whole function
+  const Loop *RegionLoop;
+
+  const DominatorTree &DT;
+  const LoopInfo &LI;
+
+  // Recognized divergent loops
+  DenseSet<const Loop *> DivergentLoops;
+
+  // The SDA links divergent branches to divergent control-flow joins.
+  SyncDependenceAnalysis &SDA;
+
+  // Use simplified code path for LCSSA form.
+  bool IsLCSSAForm;
+
+  // Set of known-uniform values.
+  DenseSet<const Value *> UniformOverrides;
+
+  // Blocks with joining divergent control from different predecessors.
+  DenseSet<const BasicBlock *> DivergentJoinBlocks;
+
+  // Detected/marked divergent values.
+  DenseSet<const Value *> DivergentValues;
+
+  // Internal worklist for divergence propagation.
+  std::vector<const Instruction *> Worklist;
+};
+
+/// \brief Divergence analysis frontend for loops.
+class LoopDivergenceAnalysis {
+public:
+  LoopDivergenceAnalysis(const DominatorTree &DT, const LoopInfo &LI,
+                         SyncDependenceAnalysis &SDA, const Loop &loop);
+
+  /// Whether \p V is divergent.
+  bool isDivergent(const Value &V) const;
+
+  /// Whether \p V is uniform/non-divergent.
+  bool isUniform(const Value &V) const { return !isDivergent(V); }
+
+  /// Print all divergent values in the loop.
+  void print(raw_ostream &OS, const Module *) const;
+
+private:
+  DivergenceAnalysis DA;
+};
+
+/// \brief Loop divergence printer pass.
+/// This is intended for use in LIT testing.
+class LoopDivergencePrinter : public FunctionPass {
+public:
+  static char ID;
+
+  LoopDivergencePrinter() : FunctionPass(ID) {
+    initializeLoopDivergencePrinterPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  /// Analyze all loop-divergence of all loops in @F and print the results.
+  bool runOnFunction(Function &F) override;
+
+  /// Print all divergent values in the function.
+  void print(raw_ostream &OS, const Module *) const override;
+
+private:
+  std::unique_ptr<SyncDependenceAnalysis> SDA;
+  SmallVector<std::unique_ptr<LoopDivergenceAnalysis>, 6> LoopDivInfo;
+};
+
+/// \brief Divergence analysis frontend for GPU kernels.
+class GPUDivergenceAnalysis {
+  SyncDependenceAnalysis SDA;
+  DivergenceAnalysis DA;
+
+public:
+  /// Runs the divergence analysis on @F, a GPU kernel
+  GPUDivergenceAnalysis(Function &F, const DominatorTree &DT,
+                        const PostDominatorTree &PDT, const LoopInfo &LI,
+                        const TargetTransformInfo &TTI);
+
+  /// Whether any divergence was detected.
+  bool hasDivergence() const { return DA.hasDetectedDivergence(); }
+
+  /// The GPU kernel this analysis result is for
+  const Function &getFunction() const { return DA.getFunction(); }
+
+  /// Whether \p V is divergent.
+  bool isDivergent(const Value &V) const;
+
+  /// Whether \p V is uniform/non-divergent
+  bool isUniform(const Value &V) const { return !isDivergent(V); }
+
+  /// Print all divergent values in the kernel.
+  void print(raw_ostream &OS, const Module *) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
Index: include/llvm/Analysis/LegacyDivergenceAnalysis.h
===================================================================
--- include/llvm/Analysis/LegacyDivergenceAnalysis.h
+++ include/llvm/Analysis/LegacyDivergenceAnalysis.h
@@ -19,9 +19,11 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 
 namespace llvm {
 class Value;
+class GPUDivergenceAnalysis;
 class LegacyDivergenceAnalysis : public FunctionPass {
 public:
   static char ID;
@@ -41,7 +43,7 @@
   //
   // Even if this function returns false, V may still be divergent when used
   // in a different basic block.
-  bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
+  bool isDivergent(const Value *V) const;
 
   // Returns true if V is uniform/non-divergent.
   //
@@ -53,6 +55,12 @@
   void removeValue(const Value *V) { DivergentValues.erase(V); }
 
 private:
+  // Whether analysis should be performed by GPUDivergenceAnalysis.
+  bool shouldUseGPUDivergenceAnalysis(const Function &F) const;
+
+  // (optional) handle to new DivergenceAnalysis
+  std::unique_ptr<GPUDivergenceAnalysis> gpuDA;
+
   // Stores all divergent values.
   DenseSet<const Value *> DivergentValues;
 };
Index: include/llvm/Analysis/Passes.h
===================================================================
--- include/llvm/Analysis/Passes.h
+++ include/llvm/Analysis/Passes.h
@@ -66,6 +66,13 @@
   //
   FunctionPass *createLegacyDivergenceAnalysisPass();
 
+  //===--------------------------------------------------------------------===//
+  //
+  // createLoopDivergencePrinterPass - This pass determines which branches and
+  // instructions in a loop are divergent.
+  //
+  FunctionPass *createLoopDivergencePrinterPass();
+
   //===--------------------------------------------------------------------===//
   //
   // Minor pass prototypes, allowing us to expose them through bugpoint and
Index: include/llvm/Analysis/SyncDependenceAnalysis.h
===================================================================
--- /dev/null
+++ include/llvm/Analysis/SyncDependenceAnalysis.h
@@ -0,0 +1,88 @@
+//===- SyncDependenceAnalysis.h - Divergent Branch Dependence -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file defines the SyncDependenceAnalysis class, which computes for
+// every divergent branch the set of phi nodes that the branch will make
+// divergent.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H
+#define LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include <memory>
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class Loop;
+class PostDominatorTree;
+class TerminatorInst;
+class TerminatorInst;
+
+using ConstBlockSet = SmallPtrSet<const BasicBlock *, 4>;
+
+/// \brief Relates points of divergent control to join points in
+/// reducible CFGs.
+///
+/// This analysis relates points of divergent control to points of converging
+/// divergent control. The analysis requires all loops to be reducible.
+class SyncDependenceAnalysis {
+  void visitSuccessor(const BasicBlock &succBlock, const Loop *termLoop,
+                      const BasicBlock *defBlock);
+
+public:
+  bool inRegion(const BasicBlock &BB) const;
+
+  ~SyncDependenceAnalysis();
+  SyncDependenceAnalysis(const DominatorTree &DT, const PostDominatorTree &PDT,
+                         const LoopInfo &LI);
+
+  /// \brief Computes divergent join points and loop exits caused by divergent
+  /// in \p Term.
+  ///
+  /// The set of blocks which are reachable by disjoint paths from \p Term.
+  /// The set also contains loop exits if there two disjoin paths:
+  /// one from \p Term to the loop exit and another from @term to the loop
+  /// header. Those exit blocks are added to the returned set. If L is the
+  /// parent loop of @term and an exit of L is in the returned set then L is a
+  /// divergent loop.
+  const ConstBlockSet &join_blocks(const TerminatorInst &term);
+
+  /// \brief Computes divergent join points and loop exits caused by the
+  /// divergent loop \p Loop.
+  ///
+  /// The set of blocks which are reachable by disjoint paths from the
+  /// loop exits of @loop.
+  /// This treats the loop as a single node in @loop's parent loop.
+  /// The returned set has the same properties as for join_blocks(TermInst&).
+  const ConstBlockSet &join_blocks(const Loop &Loop);
+
+private:
+  static ConstBlockSet EmptyBlockSet;
+
+  ReversePostOrderTraversal<const Function *> FuncRPOT;
+  const DominatorTree &DT;
+  const PostDominatorTree &PDT;
+  const LoopInfo &LI;
+
+  std::map<const Loop *, std::unique_ptr<ConstBlockSet>> CachedLoopExitJoins;
+  std::map<const TerminatorInst *, std::unique_ptr<ConstBlockSet>>
+      CachedBranchJoins;
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -211,6 +211,7 @@
 void initializeLoopDataPrefetchLegacyPassPass(PassRegistry&);
 void initializeLoopDeletionLegacyPassPass(PassRegistry&);
 void initializeLoopDistributeLegacyPass(PassRegistry&);
+void initializeLoopDivergencePrinterPass(PassRegistry&);
 void initializeLoopExtractorPass(PassRegistry&);
 void initializeLoopGuardWideningLegacyPassPass(PassRegistry&);
 void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&);
Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -122,6 +122,7 @@
       (void) llvm::createLCSSAPass();
       (void) llvm::createLegacyDivergenceAnalysisPass();
       (void) llvm::createLICMPass();
+      (void) llvm::createLoopDivergencePrinterPass();
       (void) llvm::createLoopSinkPass();
       (void) llvm::createLazyValueInfoPass();
       (void) llvm::createLoopExtractorPass();
Index: lib/Analysis/Analysis.cpp
===================================================================
--- lib/Analysis/Analysis.cpp
+++ lib/Analysis/Analysis.cpp
@@ -59,6 +59,7 @@
   initializeLazyValueInfoPrinterPass(Registry);
   initializeLegacyDivergenceAnalysisPass(Registry);
   initializeLintPass(Registry);
+  initializeLoopDivergencePrinterPass(Registry);
   initializeLoopInfoWrapperPassPass(Registry);
   initializeMemDepPrinterPass(Registry);
   initializeMemDerefPrinterPass(Registry);
Index: lib/Analysis/CMakeLists.txt
===================================================================
--- lib/Analysis/CMakeLists.txt
+++ lib/Analysis/CMakeLists.txt
@@ -25,6 +25,7 @@
   Delinearization.cpp
   DemandedBits.cpp
   DependenceAnalysis.cpp
+  DivergenceAnalysis.cpp
   DomPrinter.cpp
   DominanceFrontier.cpp
   EHPersonalities.cpp
@@ -79,6 +80,7 @@
   ScalarEvolutionAliasAnalysis.cpp
   ScalarEvolutionExpander.cpp
   ScalarEvolutionNormalization.cpp
+  SyncDependenceAnalysis.cpp
   SyntheticCountsUtils.cpp
   TargetLibraryInfo.cpp
   TargetTransformInfo.cpp
Index: lib/Analysis/DivergenceAnalysis.cpp
===================================================================
--- /dev/null
+++ lib/Analysis/DivergenceAnalysis.cpp
@@ -0,0 +1,537 @@
+//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a general divergence analysis for loop vectorization
+// and GPU programs. It determines whether a branch in a loop or GPU program
+// is divergent. It can help branch optimizations such as jump threading and
+// loop unswitching to make better decisions.
+//
+// GPU programs typically use the SIMD execution model, where multiple threads
+// in the same execution group have to execute in lock-step. Therefore, if the
+// code contains divergent branches (i.e., threads in a group do not agree on
+// which path of the branch to take), the group of threads has to execute all
+// the paths from that branch with different subsets of threads enabled until
+// they converge at the immediately post-dominating BB of the paths.
+//
+// Due to this execution model, some optimizations such as jump
+// threading and loop unswitching can be unfortunately harmful when performed on
+// divergent branches. Therefore, an analysis that computes which branches in a
+// GPU program are divergent can help the compiler to selectively run these
+// optimizations.
+//
+// This file defines divergence analysis which computes a conservative but
+// non-trivial approximation of all divergent branches in a GPU program. This
+// implementation is derived from the Vectorization Analysis of the Region
+// Vectorizer (RV). That implementation in turn is based on the approach
+// described in
+//
+//   Improving Performance of OpenCL on CPUs
+//   Ralf Karrenberg and Sebastian Hack
+//   CC '12
+//
+// The DivergenceAnalysis implementation is generic in the sense that it does
+// not itself identify original sources of divergence.
+// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and
+// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence
+// (e.g., special variables that hold the thread ID or the iteration variable).
+//
+// The generic implementation propagates divergence to variables that are data
+// or sync dependent on a source of divergence.
+//
+// While data dependency is a well-known concept, the notion of sync dependency
+// is worth more explanation. Sync dependence characterizes the control flow
+// aspect of the propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// The sync dependence detection (which branch induces divergence in which join
+// points) is implemented in the SyncDependenceAnalysis.
+//
+// The current DivergenceAnalysis implementation has the following limitations:
+// 1. intra-procedural. It conservatively considers the arguments of a
+//    non-kernel-entry function and the return value of a function call as
+//    divergent.
+// 2. memory as black box. It conservatively considers values loaded from
+//    generic or local address as divergent. This can be improved by leveraging
+//    pointer analysis and/or by modelling non-escaping memory objects in SSA
+//    as done in RV.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "divergence-analysis"
+
+// class DivergenceAnalysis
+DivergenceAnalysis::DivergenceAnalysis(
+    const Function &F, const Loop *RegionLoop, const DominatorTree &DT,
+    const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
+    : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA),
+      IsLCSSAForm(IsLCSSAForm) {}
+
+void DivergenceAnalysis::markDivergent(const Value &DivVal) {
+  assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal));
+  assert(!isAlwaysUniform(DivVal) && "cannot be a divergent");
+  DivergentValues.insert(&DivVal);
+}
+
+void DivergenceAnalysis::addUniformOverride(const Value &UniVal) {
+  UniformOverrides.insert(&UniVal);
+}
+
+bool DivergenceAnalysis::updateTerminator(const TerminatorInst &Term) const {
+  if (Term.getNumSuccessors() <= 1)
+    return false;
+  if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
+    assert(BranchTerm->isConditional());
+    return isDivergent(*BranchTerm->getCondition());
+  }
+  if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) {
+    return isDivergent(*SwitchTerm->getCondition());
+  }
+  if (isa<InvokeInst>(Term)) {
+    return false; // ignore abnormal executions through landingpad
+  }
+
+  abort();
+}
+
+bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
+  // TODO function calls with side effects, etc
+  for (const auto &Op : I.operands()) {
+    if (isDivergent(*Op))
+      return true;
+  }
+  return false;
+}
+
+bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock,
+                                             const Value &Val) const {
+  const auto *Inst = dyn_cast<const Instruction>(&Val);
+  if (!Inst)
+    return false;
+  // check whether any divergent loop carrying @val terminates before control
+  // proceeds to @observingBlock
+  for (const auto *Loop = LI.getLoopFor(Inst->getParent());
+       Loop != RegionLoop && !Loop->contains(&ObservingBlock);
+       Loop = Loop->getParentLoop()) {
+    if (DivergentLoops.find(Loop) != DivergentLoops.end())
+      return true;
+  }
+
+  return false;
+}
+
+bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const {
+  // joining divergent disjoint path in @phi parent block
+  if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) {
+    return true;
+  }
+
+  // An incoming value could be divergent by itself.
+  // Otherwise, an incoming value could be uniform within the loop
+  // that carries its definition but it may appear divergent
+  // from outside the loop. This happens when divergent loop exits
+  // drop definitions of that uniform value in different iterations.
+  //
+  // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop
+  //   if (i % thread_id == 0) break;    // divergent loop exit
+  // }
+  // int divI = i;                 // divI is divergent
+  for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) {
+    const auto *InVal = Phi.getIncomingValue(i);
+    if (isDivergent(*Phi.getIncomingValue(i)) ||
+        isTemporalDivergent(*Phi.getParent(), *InVal)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool DivergenceAnalysis::inRegion(const Instruction &I) const {
+  return I.getParent() && inRegion(*I.getParent());
+}
+
+bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
+  return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
+}
+
+// marks all users of loop-carried values of the loop headed by @loopHeader as
+// divergent
+void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
+  auto *DivLoop = LI.getLoopFor(&LoopHeader);
+  assert(DivLoop && "loopHeader is not actually part of a loop");
+
+  SmallVector<BasicBlock *, 8> TaintStack;
+  DivLoop->getExitBlocks(TaintStack);
+
+  // Otherwise potential users of loop-carried values could be anywhere in the
+  // dominance region of @DivLoop (including its fringes for phi nodes)
+  DenseSet<const BasicBlock *> Visited;
+  for (auto *Block : TaintStack) {
+    Visited.insert(Block);
+  }
+  Visited.insert(&LoopHeader);
+
+  while (!TaintStack.empty()) {
+    auto *UserBlock = TaintStack.back();
+    TaintStack.pop_back();
+
+    // don't spread divergence beyond the region
+    if (!inRegion(*UserBlock))
+      continue;
+
+    assert(!DivLoop->contains(UserBlock) &&
+           "irreducible control flow detected");
+
+    // phi nodes at the fringes of the dominance region
+    if (!DT.dominates(&LoopHeader, UserBlock)) {
+      // all PHI nodes of @userBlock become divergent
+      for (auto &Phi : UserBlock->phis()) {
+        Worklist.push_back(&Phi);
+      }
+      continue;
+    }
+
+    // taint outside users of values carried by @DivLoop
+    for (auto &I : *UserBlock) {
+      if (isAlwaysUniform(I))
+        continue;
+      if (isDivergent(I))
+        continue;
+
+      for (auto &Op : I.operands()) {
+        auto *OpInst = dyn_cast<Instruction>(&Op);
+        if (!OpInst)
+          continue;
+        if (DivLoop->contains(OpInst->getParent())) {
+          markDivergent(I);
+          pushUsers(I);
+          break;
+        }
+      }
+    }
+
+    // visit all blocks in the dominance region
+    for (auto *SuccBlock : successors(UserBlock)) {
+      if (!Visited.insert(SuccBlock).second) {
+        continue;
+      }
+      TaintStack.push_back(SuccBlock);
+    }
+  }
+}
+
+void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) {
+  for (const auto &Phi : Block.phis()) {
+    if (isDivergent(Phi))
+      continue;
+    Worklist.push_back(&Phi);
+  }
+}
+
+void DivergenceAnalysis::pushUsers(const Value &V) {
+  for (const auto *User : V.users()) {
+    const auto *UserInst = dyn_cast<const Instruction>(User);
+    if (!UserInst)
+      continue;
+
+    if (isDivergent(*UserInst))
+      continue;
+
+    // only compute divergent inside loop
+    if (!inRegion(*UserInst))
+      continue;
+    Worklist.push_back(UserInst);
+  }
+}
+
+bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
+                                                 const Loop *BranchLoop) {
+  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");
+
+  // ignore divergence outside the region
+  if (!inRegion(JoinBlock)) {
+    return false;
+  }
+
+  // push non-divergent phi nodes in @joinBlock to the worklist
+  pushPHINodes(JoinBlock);
+
+  // @joinBlock is a divergent loop exit
+  if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
+    return true;
+  }
+
+  // disjoint-paths divergent at @joinBlock
+  markBlockJoinDivergent(JoinBlock);
+  return false;
+}
+
+void DivergenceAnalysis::propagateBranchDivergence(const TerminatorInst &Term) {
+  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");
+
+  markDivergent(Term);
+
+  const auto *BranchLoop = LI.getLoopFor(Term.getParent());
+
+  // whether there is a divergent loop exit from @branchLoop (if any)
+  bool IsBranchLoopDivergent = false;
+
+  // iterate over all blocks reachable by disjoint from @term within the loop
+  // also iterates over loop exits that become divergent due to @term.
+  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
+    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+  }
+
+  // @branch loop is a divergent loop due to the divergent branch in @term
+  if (IsBranchLoopDivergent) {
+    assert(BranchLoop);
+    if (!DivergentLoops.insert(BranchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*BranchLoop);
+  }
+}
+
+void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) {
+  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n");
+
+  // don't propagate beyond region
+  if (!inRegion(*ExitingLoop.getHeader()))
+    return;
+
+  const auto *BranchLoop = ExitingLoop.getParentLoop();
+
+  // Uses of loop-carried values could occur anywhere
+  // within the dominance region of the definition. All loop-carried
+  // definitions are dominated by the loop header (reducible control).
+  // Thus all users have to be in the dominance region of the loop header,
+  // except PHI nodes that can also live at the fringes of the dom region
+  // (incoming defining value).
+  if (!IsLCSSAForm)
+    taintLoopLiveOuts(*ExitingLoop.getHeader());
+
+  // whether there is a divergent loop exit from @branchLoop (if any)
+  bool IsBranchLoopDivergent = false;
+
+  // iterate over all blocks reachable by disjoint paths from exits of
+  // @exitingLoop also iterates over loop exits (of @branchLoop) that in turn
+  // become divergent.
+  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
+    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+  }
+
+  // @branch loop is a divergent due to divergent loop exit in @exitingLoop
+  if (IsBranchLoopDivergent) {
+    assert(BranchLoop);
+    if (!DivergentLoops.insert(BranchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*BranchLoop);
+  }
+}
+
+void DivergenceAnalysis::compute() {
+  for (auto *DivVal : DivergentValues) {
+    pushUsers(*DivVal);
+  }
+
+  // propagate divergence
+  while (!Worklist.empty()) {
+    const Instruction &I = *Worklist.back();
+    Worklist.pop_back();
+
+    // maintain uniformity of overrides
+    if (isAlwaysUniform(I))
+      continue;
+
+    bool WasDivergent = isDivergent(I);
+    if (WasDivergent)
+      continue;
+
+    // propagate divergence caused by terminator
+    if (isa<TerminatorInst>(I)) {
+      auto &Term = cast<TerminatorInst>(I);
+      if (updateTerminator(Term)) {
+        // propagate control divergence to affected instructions
+        propagateBranchDivergence(Term);
+        continue;
+      }
+    }
+
+    // update divergence of I due to divergent operands
+    bool DivergentUpd = false;
+    const auto *Phi = dyn_cast<const PHINode>(&I);
+    if (Phi) {
+      DivergentUpd = updatePHINode(*Phi);
+    } else {
+      DivergentUpd = updateNormalInstruction(I);
+    }
+
+    // propagate value divergence to users
+    if (DivergentUpd) {
+      markDivergent(I);
+      pushUsers(I);
+    }
+  }
+}
+
+bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const {
+  return UniformOverrides.find(&V) != UniformOverrides.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const Value &V) const {
+  return DivergentValues.find(&V) != DivergentValues.end();
+}
+
+void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
+  if (DivergentValues.empty())
+    return;
+  // iterate instructions using instructions() to ensure a deterministic order.
+  for (auto &I : instructions(F)) {
+    if (isDivergent(I))
+      OS << "DIVERGENT:" << I << '\n';
+  }
+}
+
+// class LoopDivergenceAnalysis
+LoopDivergenceAnalysis::LoopDivergenceAnalysis(const DominatorTree &DT,
+                                               const LoopInfo &LI,
+                                               SyncDependenceAnalysis &SDA,
+                                               const Loop &Loop)
+    : DA(*Loop.getHeader()->getParent(), &Loop, DT, LI, SDA, true) {
+  for (const auto &Phi : Loop.getHeader()->phis()) {
+    DA.markDivergent(Phi);
+  }
+
+  // after the scalar remainder loop is extracted, the loop exit condition will
+  // be uniform
+  auto LoopExitingInst = Loop.getExitingBlock()->getTerminator();
+  auto LoopExitCond = cast<BranchInst>(LoopExitingInst)->getCondition();
+  DA.addUniformOverride(*LoopExitCond);
+
+  DA.compute();
+}
+
+bool LoopDivergenceAnalysis::isDivergent(const Value &V) const {
+  return DA.isDivergent(V);
+}
+
+void LoopDivergenceAnalysis::print(raw_ostream &OS, const Module *Mod) const {
+  OS << "Divergence of loop " << DA.getRegionLoop()->getName() << " {\n";
+  DA.print(OS, Mod);
+  OS << "}\n";
+}
+
+// class LoopDivergencePrinter
+bool LoopDivergencePrinter::runOnFunction(Function &F) {
+  const PostDominatorTree &PDT =
+      getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  const DominatorTree &DT =
+      getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  const LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SDA = make_unique<SyncDependenceAnalysis>(DT, PDT, LI);
+
+  for (auto &BB : F) {
+    auto *Loop = LI.getLoopFor(&BB);
+    if (!Loop || Loop->getHeader() != &BB)
+      continue;
+    LoopDivInfo.push_back(
+        make_unique<LoopDivergenceAnalysis>(DT, LI, *SDA, *Loop));
+  }
+
+  return false;
+}
+
+void LoopDivergencePrinter::print(raw_ostream &OS, const Module *Mod) const {
+  for (auto &DivInfo : LoopDivInfo) {
+    DivInfo->print(OS, Mod);
+  }
+}
+
+// Register this pass.
+char LoopDivergencePrinter::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDivergencePrinter, "loop-divergence",
+                      "Loop Divergence Printer", false, true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopDivergencePrinter, "loop-divergence",
+                    "Loop Divergence Printer", false, true)
+
+FunctionPass *llvm::createLoopDivergencePrinterPass() {
+  return new LoopDivergencePrinter();
+}
+
+void LoopDivergencePrinter::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addRequired<PostDominatorTreeWrapperPass>();
+  AU.addRequired<LoopInfoWrapperPass>();
+  AU.setPreservesAll();
+}
+
+// class GPUDivergenceAnalysis
+GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F,
+                                             const DominatorTree &DT,
+                                             const PostDominatorTree &PDT,
+                                             const LoopInfo &LI,
+                                             const TargetTransformInfo &TTI)
+    : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
+  for (auto &I : instructions(F)) {
+    if (TTI.isSourceOfDivergence(&I)) {
+      DA.markDivergent(I);
+    } else if (TTI.isAlwaysUniform(&I)) {
+      DA.addUniformOverride(I);
+    }
+  }
+  for (auto &Arg : F.args()) {
+    if (TTI.isSourceOfDivergence(&Arg)) {
+      DA.markDivergent(Arg);
+    }
+  }
+
+  DA.compute();
+}
+
+bool GPUDivergenceAnalysis::isDivergent(const Value &val) const {
+  return DA.isDivergent(val);
+}
+
+void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
+  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
+  DA.print(OS, mod);
+  OS << "}\n";
+}
Index: lib/Analysis/LegacyDivergenceAnalysis.cpp
===================================================================
--- lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -1,4 +1,5 @@
-//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis Implementation -==//
+//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis
+//Implementation -==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -64,6 +65,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -79,6 +83,12 @@
 
 #define DEBUG_TYPE "divergence"
 
+// transparently use the GPUDivergenceAnalysis
+static cl::opt<bool> UseGPUDA("use-gpu-divergence-analysis", cl::init(false),
+                              cl::Hidden,
+                              cl::desc("turn the LegacyDivergenceAnalysis into "
+                                       "a wrapper for GPUDivergenceAnalysis"));
+
 namespace {
 
 class DivergencePropagator {
@@ -262,16 +272,17 @@
   }
 }
 
-} /// end namespace anonymous
+} // namespace
 
 // Register this pass.
 char LegacyDivergenceAnalysis::ID = 0;
-INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis",
-                      false, true)
+INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence",
+                      "Legacy Divergence Analysis", false, true)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis",
-                    false, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence",
+                    "Legacy Divergence Analysis", false, true)
 
 FunctionPass *llvm::createLegacyDivergenceAnalysisPass() {
   return new LegacyDivergenceAnalysis();
@@ -280,9 +291,24 @@
 void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addRequired<PostDominatorTreeWrapperPass>();
+  if (UseGPUDA)
+    AU.addRequired<LoopInfoWrapperPass>();
   AU.setPreservesAll();
 }
 
+bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis(
+    const Function &F) const {
+  if (!UseGPUDA)
+    return false;
+
+  // GPUDivergenceAnalysis requires a reducible CFG.
+  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  using RPOTraversal = ReversePostOrderTraversal<const Function *>;
+  RPOTraversal FuncRPOT(&F);
+  return !containsIrreducibleCFG<const BasicBlock *, const RPOTraversal,
+                                 const LoopInfo>(FuncRPOT, LI);
+}
+
 bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
   auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
   if (TTIWP == nullptr)
@@ -295,36 +321,59 @@
     return false;
 
   DivergentValues.clear();
+  gpuDA = nullptr;
+
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-  DivergencePropagator DP(F, TTI,
-                          getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-                          PDT, DivergentValues);
-  DP.populateWithSourcesOfDivergence();
-  DP.propagate();
-  LLVM_DEBUG(
-    dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n";
-    print(dbgs(), F.getParent())
-  );
+
+  if (shouldUseGPUDivergenceAnalysis(F)) {
+    // run the new GPU divergence analysis
+    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    gpuDA = llvm::make_unique<GPUDivergenceAnalysis>(F, DT, PDT, LI, TTI);
+
+  } else {
+    // run LLVM's existing DivergenceAnalysis
+    DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues);
+    DP.populateWithSourcesOfDivergence();
+    DP.propagate();
+  }
+
+  LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName()
+                    << ":\n";
+             print(dbgs(), F.getParent()));
+
   return false;
 }
 
+bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const {
+  if (gpuDA) {
+    return gpuDA->isDivergent(*V);
+  }
+  return DivergentValues.count(V);
+}
+
 void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
-  if (DivergentValues.empty())
+  if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty())
     return;
-  const Value *FirstDivergentValue = *DivergentValues.begin();
+
   const Function *F;
-  if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
-    F = Arg->getParent();
-  } else if (const Instruction *I =
-                 dyn_cast<Instruction>(FirstDivergentValue)) {
-    F = I->getParent()->getParent();
-  } else {
-    llvm_unreachable("Only arguments and instructions can be divergent");
+  if (!DivergentValues.empty()) {
+    const Value *FirstDivergentValue = *DivergentValues.begin();
+    if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
+      F = Arg->getParent();
+    } else if (const Instruction *I =
+                   dyn_cast<Instruction>(FirstDivergentValue)) {
+      F = I->getParent()->getParent();
+    } else {
+      llvm_unreachable("Only arguments and instructions can be divergent");
+    }
+  } else if (gpuDA) {
+    F = &gpuDA->getFunction();
   }
 
   // Dumps all divergent values in F, arguments and then instructions.
   for (auto &Arg : F->args()) {
-    OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : "           ");
+    OS << (isDivergent(&Arg) ? "DIVERGENT: " : "           ");
     OS << Arg << "\n";
   }
   // Iterate instructions using instructions() to ensure a deterministic order.
@@ -332,7 +381,7 @@
     auto &BB = *BI;
     OS << "\n           " << BB.getName() << ":\n";
     for (auto &I : BB.instructionsWithoutDebug()) {
-      OS << (DivergentValues.count(&I) ? "DIVERGENT:     " : "               ");
+      OS << (isDivergent(&I) ? "DIVERGENT:     " : "               ");
       OS << I << "\n";
     }
   }
Index: lib/Analysis/SyncDependenceAnalysis.cpp
===================================================================
--- /dev/null
+++ lib/Analysis/SyncDependenceAnalysis.cpp
@@ -0,0 +1,381 @@
+//===- SyncDependenceAnalysis.cpp - Divergent Branch Dependence Calculation
+//--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an algorithm that returns for a divergent branch
+// the set of basic blocks whose phi nodes become divergent due to divergent
+// control. These are the blocks that are reachable by two disjoint paths from
+// the branch or loop exits that have a reaching path that is disjoint from a
+// path to the loop latch.
+//
+// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model
+// control-induced divergence in phi nodes.
+//
+// -- Summary --
+// The SyncDependenceAnalysis lazily computes sync dependences [3].
+// The analysis evaluates the disjoint path criterion [2] by a reduction
+// to SSA construction. The SSA construction algorithm is implemented as
+// a simple data-flow analysis [1].
+//
+// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy
+// [2] "Efficiently Computing Static Single Assignment Form
+//     and the Control Dependence Graph", TOPLAS '91,
+//           Cytron, Ferrante, Rosen, Wegman and Zadeck
+// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack
+// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira
+//
+// -- Sync dependence --
+// Sync dependence [4] characterizes the control flow aspect of the
+// propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// -- Reduction to SSA construction --
+// There are two disjoint paths from A to X, if a certain variant of SSA
+// construction places a phi node in X under the following set-up scheme [2].
+//
+// This variant of SSA construction ignores incoming undef values.
+// That is paths from the entry without a definition do not result in
+// phi nodes.
+//
+//       entry
+//     /      \
+//    A        \
+//  /   \       Y
+// B     C     /
+//  \   /  \  /
+//    D     E
+//     \   /
+//       F
+// Assume that A contains a divergent branch. We are interested
+// in the set of all blocks where each block is reachable from A
+// via two disjoint paths. This would be the set {D, F} in this
+// case.
+// To generally reduce this query to SSA construction we introduce
+// a virtual variable x and assign to x different values in each
+// successor block of A.
+//           entry
+//         /      \
+//        A        \
+//      /   \       Y
+// x = 0   x = 1   /
+//      \  /   \  /
+//        D     E
+//         \   /
+//           F
+// Our flavor of SSA construction for x will construct the following
+//            entry
+//          /      \
+//         A        \
+//       /   \       Y
+// x0 = 0   x1 = 1  /
+//       \   /   \ /
+//      x2=phi    E
+//         \     /
+//          x3=phi
+// The blocks D and F contain phi nodes and are thus each reachable
+// by two disjoins paths from A.
+//
+// -- Remarks --
+// In case of loop exits we need to check the disjoint path criterion for loops
+// [2]. To this end, we check whether the definition of x differs between the
+// loop exit and the loop header (_after_ SSA construction).
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/SyncDependenceAnalysis.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+
+#include <stack>
+#include <unordered_set>
+
+#define DEBUG_TYPE "sync-dependence"
+
+namespace llvm {
+
+ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet;
+
+SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT,
+                                               const PostDominatorTree &PDT,
+                                               const LoopInfo &LI)
+    : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI) {}
+
+SyncDependenceAnalysis::~SyncDependenceAnalysis() {}
+
+using FunctionRPOT = ReversePostOrderTraversal<const Function *>;
+
+// divergence propagator for reducible CFGs
+struct DivergencePropagator {
+  const FunctionRPOT &FuncRPOT;
+  const DominatorTree &DT;
+  const PostDominatorTree &PDT;
+  const LoopInfo &LI;
+
+  // identified join points
+  std::unique_ptr<ConstBlockSet> JoinBlocks;
+
+  // reached loop exits (by a path disjoint to a path to the loop header)
+  SmallPtrSet<const BasicBlock *, 4> ReachedLoopExits;
+
+  // if DefMap[B] == C then C is the dominating definition at block B
+  // if DefMap[B] ~ undef then we haven't seen B yet
+  // if DefMap[B] == B then B is a join point of disjoint paths from X or B is
+  // an immediate successor of X (initial value).
+  using DefiningBlockMap = std::map<const BasicBlock *, const BasicBlock *>;
+  DefiningBlockMap DefMap;
+
+  // all blocks with pending visits
+  std::unordered_set<const BasicBlock *> PendingUpdates;
+
+  DivergencePropagator(const FunctionRPOT &FuncRPOT, const DominatorTree &DT,
+                       const PostDominatorTree &PDT, const LoopInfo &LI)
+      : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI),
+        JoinBlocks(new ConstBlockSet) {}
+
+  // set the definition at @block and mark @block as pending for a visit
+  void addPending(const BasicBlock &Block, const BasicBlock &DefBlock) {
+    bool WasAdded = DefMap.emplace(&Block, &DefBlock).second;
+    if (WasAdded)
+      PendingUpdates.insert(&Block);
+  }
+
+  void printDefs(raw_ostream &Out) {
+    Out << "Propagator::DefMap {\n";
+    for (const auto *Block : FuncRPOT) {
+      auto It = DefMap.find(Block);
+      Out << Block->getName() << " : ";
+      if (It == DefMap.end()) {
+        Out << "\n";
+      } else {
+        const auto *DefBlock = It->second;
+        Out << (DefBlock ? DefBlock->getName() : "<null>") << "\n";
+      }
+    }
+    Out << "}\n";
+  }
+
+  // process @succBlock with reaching definition @defBlock
+  // the original divergent branch was in @parentLoop (if any)
+  void visitSuccessor(const BasicBlock &SuccBlock, const Loop *ParentLoop,
+                      const BasicBlock &DefBlock) {
+
+    // @succBlock is a loop exit
+    if (ParentLoop && !ParentLoop->contains(&SuccBlock)) {
+      DefMap.emplace(&SuccBlock, &DefBlock);
+      ReachedLoopExits.insert(&SuccBlock);
+      return;
+    }
+
+    // first reaching def?
+    auto ItLastDef = DefMap.find(&SuccBlock);
+    if (ItLastDef == DefMap.end()) {
+      addPending(SuccBlock, DefBlock);
+      return;
+    }
+
+    // a join of at least two definitions
+    if (ItLastDef->second != &DefBlock) {
+      // do we know this join already?
+      if (!JoinBlocks->insert(&SuccBlock).second)
+        return;
+
+      // update the definition
+      addPending(SuccBlock, SuccBlock);
+    }
+  }
+
+  // find all blocks reachable by two disjoint paths from @rootTerm.
+  // This method works for both divergent TerminatorInsts and loops with
+  // divergent exits.
+  // @rootBlock is either the block containing the branch or the header of the
+  // divergent loop.
+  // @nodeSuccessors is the set of successors of the node (Loop or Terminator)
+  // headed by @rootBlock.
+  // @parentLoop is the parent loop of the Loop or the loop that contains the
+  // Terminator.
+  template <typename SuccessorIterable>
+  std::unique_ptr<ConstBlockSet>
+  computeJoinPoints(const BasicBlock &RootBlock,
+                    SuccessorIterable NodeSuccessors, const Loop *ParentLoop) {
+    assert(JoinBlocks);
+
+    // immediate post dominator (no join block beyond that block)
+    const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(&RootBlock));
+    const auto *IpdNode = PdNode->getIDom();
+    const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+
+    // bootstrap with branch targets
+    for (const auto *SuccBlock : NodeSuccessors) {
+      DefMap.emplace(SuccBlock, SuccBlock);
+
+      if (ParentLoop && !ParentLoop->contains(SuccBlock)) {
+        // immediate loop exit from node.
+        ReachedLoopExits.insert(SuccBlock);
+        continue;
+      } else {
+        // regular successor
+        PendingUpdates.insert(SuccBlock);
+      }
+    }
+
+    auto ItBeginRPO = FuncRPOT.begin();
+
+    // skip until term (TODO RPOT won't let us start at @term directly)
+    for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {
+    }
+
+    auto ItEndRPO = FuncRPOT.end();
+    assert(ItBeginRPO != ItEndRPO);
+
+    // propagate definitions at the immediate successors of the node in RPO
+    auto ItBlockRPO = ItBeginRPO;
+    while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) {
+      const auto *Block = *ItBlockRPO;
+
+      // skip @block if not pending update
+      auto ItPending = PendingUpdates.find(Block);
+      if (ItPending == PendingUpdates.end())
+        continue;
+      PendingUpdates.erase(ItPending);
+
+      // propagate definition at @block to its successors
+      auto ItDef = DefMap.find(Block);
+      const auto *DefBlock = ItDef->second;
+      assert(DefBlock);
+
+      auto *BlockLoop = LI.getLoopFor(Block);
+      if (ParentLoop &&
+          (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) {
+        // if the successor is the header of a nested loop pretend its a
+        // single node with the loop's exits as successors
+        SmallVector<BasicBlock *, 4> BlockLoopExits;
+        BlockLoop->getExitBlocks(BlockLoopExits);
+        for (const auto *BlockLoopExit : BlockLoopExits) {
+          visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock);
+        }
+
+      } else {
+        // the successors are either on the same loop level or loop exits
+        for (const auto *SuccBlock : successors(Block)) {
+          visitSuccessor(*SuccBlock, ParentLoop, *DefBlock);
+        }
+      }
+    }
+
+    // We need to know the definition at the parent loop header to decide
+    // whether the definition at the header is different from the definition at
+    // the loop exits, which would indicate a divergent loop exits.
+    //
+    // A // loop header
+    // |
+    // B // nested loop header
+    // |
+    // C -> X (exit from B loop) -..-> (A latch)
+    // |
+    // D -> back to B (B latch)
+    // |
+    // proper exit from both loops
+    //
+    // D post-dominates B as it is the only proper exit from the "A loop".
+    // If C has a divergent branch, propagation will therefore stop at D.
+    // That implies that B will never receive a definition.
+    // But that definition can only be the same as at D (D itself in thise case)
+    // because all paths to anywhere have to pass through D.
+    //
+    const BasicBlock *ParentLoopHeader =
+        ParentLoop ? ParentLoop->getHeader() : nullptr;
+    if (ParentLoop && ParentLoop->contains(PdBoundBlock)) {
+      DefMap[ParentLoopHeader] = DefMap[PdBoundBlock];
+    }
+
+    // analyze reached loop exits
+    if (!ReachedLoopExits.empty()) {
+      assert(ParentLoop);
+      const auto *HeaderDefBlock = DefMap[ParentLoopHeader];
+      LLVM_DEBUG(printDefs(dbgs()));
+      assert(HeaderDefBlock && "no definition in header of carrying loop");
+
+      for (const auto *ExitBlock : ReachedLoopExits) {
+        auto ItExitDef = DefMap.find(ExitBlock);
+        assert((ItExitDef != DefMap.end()) &&
+               "no reaching def at reachable loop exit");
+        if (ItExitDef->second != HeaderDefBlock) {
+          JoinBlocks->insert(ExitBlock);
+        }
+      }
+    }
+
+    return std::move(JoinBlocks);
+  }
+};
+
+const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) {
+  using LoopExitVec = SmallVector<BasicBlock *, 4>;
+  LoopExitVec LoopExits;
+  Loop.getExitBlocks(LoopExits);
+  if (LoopExits.size() < 1) {
+    return EmptyBlockSet;
+  }
+
+  // already available in cache?
+  auto ItCached = CachedLoopExitJoins.find(&Loop);
+  if (ItCached != CachedLoopExitJoins.end())
+    return *ItCached->second;
+
+  // compute all join points
+  DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
+  auto JoinBlocks = Propagator.computeJoinPoints<const LoopExitVec &>(
+      *Loop.getHeader(), LoopExits, Loop.getParentLoop());
+
+  auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks));
+  assert(ItInserted.second);
+  return *ItInserted.first->second;
+}
+
+const ConstBlockSet &
+SyncDependenceAnalysis::join_blocks(const TerminatorInst &Term) {
+  // trivial case
+  if (Term.getNumSuccessors() < 1) {
+    return EmptyBlockSet;
+  }
+
+  // already available in cache?
+  auto ItCached = CachedBranchJoins.find(&Term);
+  if (ItCached != CachedBranchJoins.end())
+    return *ItCached->second;
+
+  // compute all join points
+  DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
+  const auto &TermBlock = *Term.getParent();
+  auto JoinBlocks = Propagator.computeJoinPoints<succ_const_range>(
+      TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock));
+
+  auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
+  assert(ItInserted.second);
+  return *ItInserted.first->second;
+}
+
+} // namespace llvm
Index: test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
@@ -0,0 +1,14 @@
+; RUN: opt  -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+define amdgpu_kernel void @workitem_id_x() #1 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x)
+; CHECK-NOT: DIVERGENT:  %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+
+attributes #0 = { nounwind readnone }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
@@ -0,0 +1,45 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
+define i32 @test1(i32* %ptr, i32 %val) #0 {
+  %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
+  ret i32 %orig
+}
+
+; CHECK: DIVERGENT: %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) {
+  %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+  ret {i32, i1} %orig
+}
+
+; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
+  %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+  ret i32 %ret
+}
+
+; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
+  %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+  ret i64 %ret
+}
+
+; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
+  %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+  ret i32 %ret
+}
+
+; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
+  %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+  ret i64 %ret
+}
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind argmemonly }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
@@ -0,0 +1,26 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %cond.var = icmp slt i32 %tid, 0
+  br i1 %cond.var, label %B, label %C ; divergent
+; CHECK: DIVERGENT: br i1 %cond.var,
+B:
+  %cond.uni = icmp slt i32 %n, 0
+  br i1 %cond.uni, label %C, label %merge ; uniform
+; CHECK-NOT: DIVERGENT: br i1 %cond.uni,
+C:
+  %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B  ]
+; CHECK: DIVERGENT: %phi.var.hidden = phi i32
+  br label %merge
+merge:
+  %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ]
+; CHECK: DIVERGENT: %phi.ipd = phi i32
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll
@@ -0,0 +1,223 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; divergent loop (H<header><exiting to X>, B<exiting to Y>)
+; the divergent join point in %exit is obscured by uniform control joining in %X
+define amdgpu_kernel void @hidden_loop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_loop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %X, label %H  ; uniform
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ]
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %B ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+B:
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exity = icmp sgt i32 %tid, 0
+  br i1 %div.exity, label %Y, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exity =  
+; CHECK: DIVERGENT: br i1 %div.exity, 
+
+X:
+  %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %H ] ; temporal divergent phi
+  br i1 %uni.cond, label %Y, label %exit
+; CHECK: DIVERGENT: %div.merge.x =
+
+Y:
+  %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ]
+  br label %exit
+; CHECK: DIVERGENT: %div.merge.y =
+
+exit:
+  %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.exit =
+}
+
+; divergent loop (H<header><exiting to X>, B<exiting to Y>)
+; the phi nodes in X and Y don't actually receive divergent values
+define amdgpu_kernel void @unobserved_loop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unobserved_loop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %X, label %H  ; uniform
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ]
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %B ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+B:
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exity = icmp sgt i32 %tid, 0
+  br i1 %div.exity, label %Y, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exity =  
+; CHECK: DIVERGENT: br i1 %div.exity, 
+
+X:
+  %uni.merge.x = phi i32 [ %a, %entry ], [ %b, %H ] 
+  br label %exit
+
+Y:
+  %uni.merge.y = phi i32 [ %b, %B ]
+  br label %exit
+
+exit:
+  %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.exit =
+}
+
+; divergent loop (G<header>, L<exiting to D>) inside divergent loop (H<header>, B<exiting to X>, C<exiting to Y>, D, G, L)
+; the inner loop has no exit to top level.
+; the outer loop becomes divergent as its exiting branch in C is control-dependent on the inner loop's divergent loop exit in D.
+define amdgpu_kernel void @hidden_nestedloop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_nestedloop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %uni.cond, label %X, label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %D ]
+  br i1 %uni.cond, label %G, label %B
+; CHECK: DIVERGENT: %div.exitx =  
+
+B:
+  br i1 %uni.cond, label %X, label %C 
+
+C:
+  br i1 %uni.cond, label %Y, label %D
+
+D:
+  %uni.inc = add i32 %uni.merge.h, 1
+  br label %H
+
+G:
+  br i1 %div.exitx, label %C, label %L
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+L:
+  br i1 %uni.cond, label %D, label %G
+
+X:
+  %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %B ] ; temporal divergent phi
+  br i1 %uni.cond, label %Y, label %exit
+; CHECK: DIVERGENT: %div.merge.x =
+
+Y:
+  %div.merge.y = phi i32 [ 42, %X ], [ %b, %C ]
+  br label %exit
+; CHECK: DIVERGENT: %div.merge.y =
+
+exit:
+  %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.exit =
+}
+
+; divergent loop (G<header>, L<exiting to X>) in divergent loop (H<header>, B<exiting to C>, C, G, L)
+; the outer loop has no immediately divergent exiting edge.
+; the inner exiting edge is exiting to top-level through the outer loop causing both to become divergent.
+define amdgpu_kernel void @hidden_doublebreak_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_doublebreak_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %uni.cond, label %X, label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %C ]
+  br i1 %uni.cond, label %G, label %B
+; CHECK: DIVERGENT: %div.exitx =  
+
+B:
+  br i1 %uni.cond, label %Y, label %C 
+
+C:
+  %uni.inc = add i32 %uni.merge.h, 1
+  br label %H
+
+G:
+  br i1 %div.exitx, label %X, label %L ; two-level break
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+L:
+  br i1 %uni.cond, label %C, label %G
+
+X:
+  %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %G ] ; temporal divergence
+  br label %Y
+; CHECK: DIVERGENT: %div.merge.x =
+
+Y:
+  %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.y =
+}
+
+; divergent loop (G<header>, L<exiting to D>) contained inside a uniform loop (H<header>, B, G, L , D<exiting to x>)
+define amdgpu_kernel void @hidden_containedloop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_containedloop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %uni.cond, label %X, label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc.d, %D ]
+  br i1 %uni.cond, label %G, label %B
+; CHECK: DIVERGENT: %div.exitx =  
+
+B:
+  %div.merge.b = phi i32 [ 42, %H ], [ %uni.merge.g, %G ]
+  br label %D
+; CHECK: DIVERGENT: %div.merge.b =
+
+G:
+  %uni.merge.g = phi i32 [ 123, %H ], [ %uni.inc.l, %L ]
+  br i1 %div.exitx, label %B, label %L
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+L:
+  %uni.inc.l = add i32 %uni.merge.g, 1
+  br i1 %uni.cond, label %G, label %D
+
+D:
+  %uni.inc.d = add i32 %uni.merge.h, 1
+  br i1 %uni.cond, label %X, label %H
+
+X:
+  %uni.merge.x = phi i32 [ %a, %entry ], [ %uni.inc.d, %D ]
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
@@ -0,0 +1,13 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
+define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
+  %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
+  store i32 %swizzle, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
+
+attributes #0 = { nounwind convergent }
+attributes #1 = { nounwind readnone convergent }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/irreducible.ll
@@ -0,0 +1,48 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; This test contains an unstructured loop.
+;           +-------------- entry ----------------+
+;           |                                     |
+;           V                                     V
+; i1 = phi(0, i3)                            i2 = phi(0, i3)
+;     j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2
+;           ^                 |                   ^
+;           |                 V                   |
+;           +-------- switch (tid / i3) ----------+
+;                             |
+;                             V
+;                        if (i3 == 5) // divergent
+; because sync dependent on (tid / i3).
+define i32 @unstructured_loop(i1 %entry_cond) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
+loop_entry_1:
+  %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
+  %j1 = add i32 %i1, 1
+  br label %loop_body
+loop_entry_2:
+  %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
+  %j2 = add i32 %i2, 2
+  br label %loop_body
+loop_body:
+  %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ]
+  br label %loop_latch
+loop_latch:
+  %div = sdiv i32 %tid, %i3
+  switch i32 %div, label %branch [ i32 1, label %loop_entry_1
+                                   i32 2, label %loop_entry_2 ]
+branch:
+  %cmp = icmp eq i32 %i3, 5
+  br i1 %cmp, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cmp,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
@@ -0,0 +1,41 @@
+; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_ps':
+; CHECK: DIVERGENT:
+; CHECK-NOT: %arg0
+; CHECK-NOT: %arg1
+; CHECK-NOT: %arg2
+; CHECK: <2 x i32> %arg3
+; CHECK: DIVERGENT:  <3 x i32> %arg4
+; CHECK: DIVERGENT:  float %arg5
+; CHECK: DIVERGENT:  i32 %arg6
+
+define amdgpu_ps void @test_amdgpu_ps([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
+  ret void
+}
+
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_amdgpu_kernel':
+; CHECK-NOT: %arg0
+; CHECK-NOT: %arg1
+; CHECK-NOT: %arg2
+; CHECK-NOT: %arg3
+; CHECK-NOT: %arg4
+; CHECK-NOT: %arg5
+; CHECK-NOT: %arg6
+define amdgpu_kernel void @test_amdgpu_kernel([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
+  ret void
+}
+
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'test_c':
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+; CHECK: DIVERGENT:
+define void @test_c([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
+  ret void
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -0,0 +1,103 @@
+;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(
+define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add(
+define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(
+define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(
+define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(
+define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(
+define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(
+define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and(
+define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or(
+define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(
+define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(
+define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0
+declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
@@ -0,0 +1,131 @@
+;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(
+define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(
+define float @image_atomic_add(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(
+define float @image_atomic_sub(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(
+define float @image_atomic_smin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(
+define float @image_atomic_umin(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(
+define float @image_atomic_smax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(
+define float @image_atomic_umax(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(
+define float @image_atomic_and(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(
+define float @image_atomic_or(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(
+define float @image_atomic_xor(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(
+define float @image_atomic_inc(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(
+define float @image_atomic_dec(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(
+define float @image_atomic_cmpswap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data, i32 inreg %cmp) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %data, i32 %cmp, i32 %addr, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(
+define float @image_atomic_add_2d(<8 x i32> inreg %rsrc, i32 inreg %s, i32 inreg %t, i32 inreg %data) #0 {
+main_body:
+  %orig = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  %r = bitcast i32 %orig to float
+  ret float %r
+}
+
+declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0
+
+declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
 
 ; Test that we consider loads from flat and private addrspaces to be divergent.
 
Index: test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
@@ -0,0 +1,30 @@
+; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+; CHECK: DIVERGENT:  %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2
+; CHECK: DIVERGENT:  %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4
+; CHECK: DIVERGENT:  %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4
+
+; The post dominator tree does not have a root node in this case
+define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
+bb0:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %tmp2 = sext i32 %tmp to i64
+  %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2
+  %tmp6 = load volatile float, float addrspace(1)* %tmp5, align 4
+  %tmp8 = fcmp olt float %tmp6, 0.000000e+00
+  br i1 %tmp8, label %bb1, label %bb2
+
+bb1:
+  %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4
+  br label %bb2
+
+bb2:
+  %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4
+  br label %bb1
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
@@ -0,0 +1,31 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; CHECK-LABEL: 'test1':
+; CHECK-NEXT: DIVERGENT: i32 %bound
+; CHECK: {{^  *}}%counter =
+; CHECK-NEXT: DIVERGENT: %break = icmp sge i32 %counter, %bound
+; CHECK-NEXT: DIVERGENT: br i1 %break, label %footer, label %body
+; CHECK: {{^  *}}%counter.next =
+; CHECK: {{^  *}}%counter.footer =
+; CHECK: DIVERGENT: br i1 %break, label %end, label %header
+; Note: %counter is not divergent!
+define amdgpu_ps void @test1(i32 %bound) {
+entry:
+  br label %header
+
+header:
+  %counter = phi i32 [ 0, %entry ], [ %counter.footer, %footer ]
+  %break = icmp sge i32 %counter, %bound
+  br i1 %break, label %footer, label %body
+
+body:
+  %counter.next = add i32 %counter, 1
+  br label %footer
+
+footer:
+  %counter.footer = phi i32 [ %counter.next, %body ], [ undef, %header ]
+  br i1 %break, label %end, label %header
+
+end:
+  ret void
+}
Index: test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/temporal_diverge.ll
@@ -0,0 +1,154 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; temporal-divergent use of value carried by divergent loop
+define amdgpu_kernel void @temporal_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ]
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+X:
+  %div.user = add i32 %uni.inc, 5
+  ret void
+}
+
+; temporal-divergent use of value carried by divergent loop inside a top-level loop
+define amdgpu_kernel void @temporal_diverge_inloop(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_inloop':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br label %G
+
+G:
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ]
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+X:
+  %div.user = add i32 %uni.inc, 5
+  br i1 %uni.cond, label %G, label %Y
+
+Y:
+  %div.alsouser = add i32 %uni.inc, 5
+  ret void
+}
+
+
+; temporal-uniform use of a valud, definition and users are carried by a surrounding divergent loop
+define amdgpu_kernel void @temporal_uniform_indivloop(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_uniform_indivloop':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br label %G
+
+G:
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %G ], [ %uni.inc, %H ]
+  %uni.inc = add i32 %uni.merge.h, 1
+  br i1 %uni.cond, label %X, label %H ; divergent branch
+
+X:
+  %uni.user = add i32 %uni.inc, 5
+  %div.exity = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.exity =  
+  br i1 %div.exity, label %G, label %Y
+; CHECK: DIVERGENT: br i1 %div.exity, 
+
+Y:
+  %div.alsouser = add i32 %uni.inc, 5
+  ret void
+}
+
+
+; temporal-divergent use of value carried by divergent loop, user is inside sibling loop
+define amdgpu_kernel void @temporal_diverge_loopuser(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ]
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+X:
+  br label %G
+
+G:
+  %div.user = add i32 %uni.inc, 5
+  br i1 %uni.cond, label %G, label %Y
+
+Y:
+  ret void
+}
+
+; temporal-divergent use of value carried by divergent loop, user is inside sibling loop, defs and use are carried by a uniform loop
+define amdgpu_kernel void @temporal_diverge_loopuser_nested(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'temporal_diverge_loopuser_nested':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %H ]
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+X:
+  br label %G
+
+G:
+  %div.user = add i32 %uni.inc, 5
+  br i1 %uni.cond, label %G, label %Y
+
+Y:
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -0,0 +1,45 @@
+; RUN: opt  -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+declare i32 @llvm.amdgcn.workitem.id.z() #0
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
+
+; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @workitem_id_x() #1 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %id.x, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: DIVERGENT:  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+define amdgpu_kernel void @workitem_id_y() #1 {
+  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+  store volatile i32 %id.y, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: DIVERGENT:  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+define amdgpu_kernel void @workitem_id_z() #1 {
+  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %id.z, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: DIVERGENT:  %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
+define amdgpu_kernel void @mbcnt_lo() #1 {
+  %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
+  store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: DIVERGENT:  %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
+define amdgpu_kernel void @mbcnt_hi() #1 {
+  %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
+  store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll
@@ -0,0 +1,74 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ]
+; CHECK-NEXT: DIVERGENT:  %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29
+; CHECK-NEXT: DIVERGENT:  store double %x.0.lcssa, double* %arrayidx10, align 8
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1
+; CHECK-NEXT: DIVERGENT:  %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29
+; CHECK-NEXT: DIVERGENT:  %2 = load double, double* %arrayidx8, align 8
+; CHECK-NEXT: DIVERGENT:  %add = fadd double %x.025, %2
+; CHECK-NEXT: }
+; CHECK: Divergence of loop for.body4 {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ]
+; CHECK-NEXT: DIVERGENT:  %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv
+; CHECK-NEXT: DIVERGENT:  %0 = load i32, i32* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %idxprom5 = sext i32 %0 to i64
+; CHECK-NEXT: DIVERGENT:  %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5
+; CHECK-NEXT: DIVERGENT:  %1 = load double*, double** %arrayidx6, align 8
+; CHECK-NEXT: DIVERGENT:  %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29
+; CHECK-NEXT: DIVERGENT:  %2 = load double, double* %arrayidx8, align 8
+; CHECK-NEXT: DIVERGENT:  %add = fadd double %x.025, %2
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+
+; Function Attrs: norecurse nounwind uwtable
+define void @test(i32* nocapture readonly %Index, double** nocapture readonly %A, double* nocapture %C, i32 %m, i32 %n) #0 {
+entry:
+  %cmp27 = icmp sgt i32 %n, 0
+  br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp224 = icmp sgt i32 %m, 0
+  %wide.trip.count = zext i32 %m to i64
+  %wide.trip.count31 = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup3, %for.body.lr.ph
+  %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ]
+  br i1 %cmp224, label %for.body4.preheader, label %for.cond.cleanup3
+
+for.body4.preheader:                              ; preds = %for.body
+  br label %for.body4
+
+for.cond.cleanup3:                                ; preds = %for.body4, %for.body
+  %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ]
+  %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29
+  store double %x.0.lcssa, double* %arrayidx10, align 8
+  %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1
+  %exitcond32 = icmp eq i64 %indvars.iv.next30, %wide.trip.count31
+  br i1 %exitcond32, label %for.cond.cleanup, label %for.body
+
+for.body4:                                        ; preds = %for.body4.preheader, %for.body4
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ]
+  %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %idxprom5 = sext i32 %0 to i64
+  %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5
+  %1 = load double*, double** %arrayidx6, align 8
+  %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29
+  %2 = load double, double* %arrayidx8, align 8
+  %add = fadd double %x.025, %2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+}
+
+attributes #0 = { norecurse nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll
@@ -0,0 +1,44 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ]
+; CHECK-NEXT: DIVERGENT:  %hfreq = srem i64 %indvars.iv, 2
+; CHECK-NEXT: DIVERGENT:  %toggle = trunc i64 %hfreq to i1
+; CHECK-NEXT: DIVERGENT:  br i1 %toggle, label %A, label %B
+; CHECK-NEXT: DIVERGENT:  %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv
+; CHECK-NEXT: DIVERGENT:  store float %divphi, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+define void @test1(float* nocapture %ptr, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ]
+  %hfreq = srem i64 %indvars.iv, 2
+  %toggle = trunc i64 %hfreq to i1
+  br i1 %toggle, label %A, label %B
+
+A:
+  %trunc = trunc i64 %n to i32
+  %cast = sitofp i32 %trunc to float
+  br label %B
+
+B:
+  %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv
+  store float %divphi, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll
@@ -0,0 +1,60 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Printing analysis 'Loop Divergence Printer' for function 'test1':
+; CHECK-NEXT: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ]
+; CHECK-NEXT: DIVERGENT:  %row = mul i64 %n, %indvars.iv
+; CHECK-NEXT: DIVERGENT:  %idx = add i64 %row, %indvars.iv2
+; CHECK-NEXT: DIVERGENT:  %trunc = trunc i64 %idx to i32
+; CHECK-NEXT: DIVERGENT:  %val = sitofp i32 %trunc to float
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+; CHECK-NEXT: DIVERGENT:  store float %val, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv
+; CHECK-NEXT: DIVERGENT:  br i1 %exitcond2, label %for.latch, label %for.body2
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+; CHECK: Divergence of loop for.body2 {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ]
+; CHECK-NEXT: DIVERGENT:  %idx = add i64 %row, %indvars.iv2
+; CHECK-NEXT: DIVERGENT:  %trunc = trunc i64 %idx to i32
+; CHECK-NEXT: DIVERGENT:  %val = sitofp i32 %trunc to float
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+; CHECK-NEXT: DIVERGENT:  store float %val, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1
+; CHECK-NEXT: }
+
+define void @test1(float* nocapture %ptr, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %exit
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+exit:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ]
+  br label %for.body2
+
+for.body2:
+  %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ]
+  %row = mul i64 %n, %indvars.iv
+  %idx = add i64 %row, %indvars.iv2
+  %trunc = trunc i64 %idx to i32
+  %val = sitofp i32 %trunc to float
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+  store float %val, float* %arrayidx, align 4
+  %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1
+  %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv
+  br i1 %exitcond2, label %for.latch, label %for.body2
+
+for.latch:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %exit, label %for.body
+}
+
+attributes #0 = { nounwind }
+
Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll
@@ -0,0 +1,31 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+; CHECK-NEXT: DIVERGENT:  store float %cast, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+define void @test1(float* nocapture %A, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %trunc = trunc i64 %n to i32
+  %cast = sitofp i32 %trunc to float
+  store float %cast, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll
@@ -0,0 +1,39 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv
+; CHECK-NEXT: DIVERGENT:  store float %divphi, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+define void @test1(float* nocapture %ptr, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ]
+  %invar = trunc i64 %n to i1
+  br i1 %invar, label %A, label %B
+
+A:
+  %trunc = trunc i64 %n to i32
+  %cast = sitofp i32 %trunc to float
+  br label %B
+
+B:
+  %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv
+  store float %divphi, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll
@@ -0,0 +1,55 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ]
+; CHECK-NEXT: DIVERGENT:  %row = mul i64 %n, %indvars.iv
+; CHECK-NEXT: DIVERGENT:  %idx = add i64 %row, %indvars.iv2
+; CHECK-NEXT: DIVERGENT:  %trunc = trunc i64 %idx to i32
+; CHECK-NEXT: DIVERGENT:  %val = sitofp i32 %trunc to float
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+; CHECK-NEXT: DIVERGENT:  store float %val, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+; CHECK: Divergence of loop for.body2 {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ]
+; CHECK-NEXT: DIVERGENT:  %idx = add i64 %row, %indvars.iv2
+; CHECK-NEXT: DIVERGENT:  %trunc = trunc i64 %idx to i32
+; CHECK-NEXT: DIVERGENT:  %val = sitofp i32 %trunc to float
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+; CHECK-NEXT: DIVERGENT:  store float %val, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1
+; CHECK-NEXT: }
+define void @test1(float* nocapture %ptr, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %exit
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+exit:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ]
+  br label %for.body2
+
+for.body2:
+  %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ]
+  %row = mul i64 %n, %indvars.iv
+  %idx = add i64 %row, %indvars.iv2
+  %trunc = trunc i64 %idx to i32
+  %val = sitofp i32 %trunc to float
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+  store float %val, float* %arrayidx, align 4
+  %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1
+  %exitcond2 = icmp eq i64 %indvars.iv.next2, %n
+  br i1 %exitcond2, label %for.latch, label %for.body2
+
+for.latch:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %exit, label %for.body
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll
@@ -0,0 +1,110 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ]
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1
+; CHECK-NEXT: DIVERGENT:  %5 = add nsw i64 %4, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds double, double* %A, i64 %5
+; CHECK-NEXT: DIVERGENT:  %6 = load double, double* %arrayidx, align 8
+; CHECK-NEXT: DIVERGENT:  %8 = add nsw i64 %7, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8
+; CHECK-NEXT: DIVERGENT:  %9 = load double, double* %arrayidx14, align 8
+; CHECK-NEXT: DIVERGENT:  %add15 = fadd double %6, %9
+; CHECK-NEXT: DIVERGENT:  store double %add15, double* %arrayidx14, align 8
+; CHECK-NEXT: }
+; CHECK: Divergence of loop for.body8.lr.ph {
+; CHECK-NEXT: DIVERGENT:  %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ]
+; CHECK-NEXT: DIVERGENT:  %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ]
+; CHECK-NEXT: DIVERGENT:  %1 = sext i32 %mul44 to i64
+; CHECK-NEXT: DIVERGENT:  %2 = sext i32 %len.043 to i64
+; CHECK-NEXT: DIVERGENT:  %mul = shl nsw i32 %mul44, 1
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ]
+; CHECK-NEXT: DIVERGENT:  %3 = add nsw i64 %indvars.iv, %2
+; CHECK-NEXT: DIVERGENT:  %4 = mul nsw i64 %3, %0
+; CHECK-NEXT: DIVERGENT:  %5 = add nsw i64 %4, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds double, double* %A, i64 %5
+; CHECK-NEXT: DIVERGENT:  %6 = load double, double* %arrayidx, align 8
+; CHECK-NEXT: DIVERGENT:  %7 = mul nsw i64 %indvars.iv, %0
+; CHECK-NEXT: DIVERGENT:  %8 = add nsw i64 %7, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8
+; CHECK-NEXT: DIVERGENT:  %9 = load double, double* %arrayidx14, align 8
+; CHECK-NEXT: DIVERGENT:  %add15 = fadd double %6, %9
+; CHECK-NEXT: DIVERGENT:  store double %add15, double* %arrayidx14, align 8
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add i64 %indvars.iv, %1
+; CHECK-NEXT: DIVERGENT:  %cmp6 = icmp slt i64 %indvars.iv.next, %0
+; CHECK-NEXT: DIVERGENT:  br i1 %cmp6, label %for.body8, label %for.cond.cleanup7
+; CHECK-NEXT: }
+; CHECK: Divergence of loop for.body8 {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ]
+; CHECK-NEXT: DIVERGENT:  %3 = add nsw i64 %indvars.iv, %2
+; CHECK-NEXT: DIVERGENT:  %4 = mul nsw i64 %3, %0
+; CHECK-NEXT: DIVERGENT:  %5 = add nsw i64 %4, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds double, double* %A, i64 %5
+; CHECK-NEXT: DIVERGENT:  %6 = load double, double* %arrayidx, align 8
+; CHECK-NEXT: DIVERGENT:  %7 = mul nsw i64 %indvars.iv, %0
+; CHECK-NEXT: DIVERGENT:  %8 = add nsw i64 %7, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8
+; CHECK-NEXT: DIVERGENT:  %9 = load double, double* %arrayidx14, align 8
+; CHECK-NEXT: DIVERGENT:  %add15 = fadd double %6, %9
+; CHECK-NEXT: DIVERGENT:  store double %add15, double* %arrayidx14, align 8
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add i64 %indvars.iv, %1
+; CHECK-NEXT: }
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(double* nocapture %A, i32 %n) local_unnamed_addr #0 {
+entry:
+  %cmp45 = icmp sgt i32 %n, 0
+  br i1 %cmp45, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp242 = icmp sgt i32 %n, 2
+  %0 = sext i32 %n to i64
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup3, %for.body.lr.ph
+  %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ]
+  br i1 %cmp242, label %for.body8.lr.ph.preheader, label %for.cond.cleanup3
+
+for.body8.lr.ph.preheader:                        ; preds = %for.body
+  br label %for.body8.lr.ph
+
+for.cond.cleanup3:                                ; preds = %for.cond.cleanup7, %for.body
+  %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1
+  %exitcond = icmp eq i64 %indvars.iv.next54, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.body8.lr.ph:                                  ; preds = %for.body8.lr.ph.preheader, %for.cond.cleanup7
+  %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ]
+  %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ]
+  %1 = sext i32 %mul44 to i64
+  %2 = sext i32 %len.043 to i64
+  br label %for.body8
+
+for.cond.cleanup7:                                ; preds = %for.body8
+  %mul = shl nsw i32 %mul44, 1
+  %cmp2 = icmp slt i32 %mul, %n
+  br i1 %cmp2, label %for.body8.lr.ph, label %for.cond.cleanup3
+
+for.body8:                                        ; preds = %for.body8.lr.ph, %for.body8
+  %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ]
+  %3 = add nsw i64 %indvars.iv, %2
+  %4 = mul nsw i64 %3, %0
+  %5 = add nsw i64 %4, %indvars.iv53
+  %arrayidx = getelementptr inbounds double, double* %A, i64 %5
+  %6 = load double, double* %arrayidx, align 8
+  %7 = mul nsw i64 %indvars.iv, %0
+  %8 = add nsw i64 %7, %indvars.iv53
+  %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8
+  %9 = load double, double* %arrayidx14, align 8
+  %add15 = fadd double %6, %9
+  store double %add15, double* %arrayidx14, align 8
+  %indvars.iv.next = add i64 %indvars.iv, %1
+  %cmp6 = icmp slt i64 %indvars.iv.next, %0
+  br i1 %cmp6, label %for.body8, label %for.cond.cleanup7
+}
+
+attributes #0 = { norecurse nounwind uwtable }
Index: test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll
@@ -0,0 +1,29 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+; CHECK-NEXT: DIVERGENT:  store float 4.200000e+01, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+define void @test1(float* nocapture %A, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float 4.200000e+01, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll
@@ -0,0 +1,47 @@
+; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @daorder(i32 %n) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'daorder'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cond = icmp slt i32 %tid, 0
+  br i1 %cond, label %A, label %B ; divergent
+; CHECK: DIVERGENT: br i1 %cond,
+A:
+  %defAtA = add i32 %n, 1 ; uniform
+; CHECK-NOT: DIVERGENT: %defAtA = 
+  br label %C
+B:
+  %defAtB = add i32 %n, 2 ; uniform
+; CHECK-NOT: DIVERGENT: %defAtB = 
+  br label %C
+C:
+  %defAtC = phi i32 [ %defAtA, %A ], [ %defAtB, %B ] ; divergent
+; CHECK: DIVERGENT: %defAtC =
+  br label %D
+
+D:
+  %i = phi i32 [0, %C], [ %i.inc, %E ] ; uniform
+; CHECK-NOT: DIVERGENT: %i = phi
+  br label %E
+
+E:
+  %i.inc = add i32 %i, 1
+  %loopCnt = icmp slt i32 %i.inc, %n
+; CHECK-NOT: DIVERGENT: %loopCnt =
+  br i1 %loopCnt, label %D, label %exit
+
+exit:
+  ret i32 %n
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+
+!nvvm.annotations = !{!0}
+!0 = !{i32 (i32)* @daorder, !"kernel", i32 1}
Index: test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
@@ -0,0 +1,175 @@
+; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
+define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'no_diverge'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cond = icmp slt i32 %n, 0
+  br i1 %cond, label %then, label %else ; uniform
+; CHECK-NOT: DIVERGENT: br i1 %cond,
+then:
+  %a1 = add i32 %a, %tid
+  br label %merge
+else:
+  %b2 = add i32 %b, %tid
+  br label %merge
+merge:
+  %c = phi i32 [ %a1, %then ], [ %b2, %else ]
+  ret i32 %c
+}
+
+; c = a;
+; if (threadIdx.x < 5)    // divergent: data dependent
+;   c = b;
+; return c;               // c is divergent: sync dependent
+define i32 @sync(i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'sync'
+bb1:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  %cond = icmp slt i32 %tid, 5
+  br i1 %cond, label %bb2, label %bb3
+; CHECK: DIVERGENT: br i1 %cond,
+bb2:
+  br label %bb3
+bb3:
+  %c = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] ; sync dependent on tid
+; CHECK: DIVERGENT: %c =
+  ret i32 %c
+}
+
+; c = 0;
+; if (threadIdx.x >= 5) {  // divergent
+;   c = (n < 0 ? a : b);  // c here is uniform because n is uniform
+; }
+; // c here is divergent because it is sync dependent on threadIdx.x >= 5
+; return c;
+define i32 @mixed(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'mixed'
+bb1:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+  %cond = icmp slt i32 %tid, 5
+  br i1 %cond, label %bb6, label %bb2
+; CHECK: DIVERGENT: br i1 %cond,
+bb2:
+  %cond2 = icmp slt i32 %n, 0
+  br i1 %cond2, label %bb4, label %bb3
+bb3:
+  br label %bb5
+bb4:
+  br label %bb5
+bb5:
+  %c = phi i32 [ %a, %bb3 ], [ %b, %bb4 ]
+; CHECK-NOT: DIVERGENT: %c =
+  br label %bb6
+bb6:
+  %c2 = phi i32 [ 0, %bb1], [ %c, %bb5 ]
+; CHECK: DIVERGENT: %c2 =
+  ret i32 %c2
+}
+
+; We conservatively treats all parameters of a __device__ function as divergent.
+define i32 @device(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'device'
+; CHECK: DIVERGENT: i32 %n
+; CHECK: DIVERGENT: i32 %a
+; CHECK: DIVERGENT: i32 %b
+entry:
+  %cond = icmp slt i32 %n, 0
+  br i1 %cond, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cond,
+then:
+  br label %merge
+else:
+  br label %merge
+merge:
+  %c = phi i32 [ %a, %then ], [ %b, %else ]
+  ret i32 %c
+}
+
+; int i = 0;
+; do {
+;   i++;                  // i here is uniform
+; } while (i < laneid);
+; return i == 10 ? 0 : 1; // i here is divergent
+;
+; The i defined in the loop is used outside.
+define i32 @loop() {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'loop'
+entry:
+  %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
+  br label %loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
+; CHECK-NOT: DIVERGENT: %i =
+  %i1 = add i32 %i, 1
+  %exit_cond = icmp sge i32 %i1, %laneid
+  br i1 %exit_cond, label %loop_exit, label %loop
+loop_exit:
+  %cond = icmp eq i32 %i, 10
+  br i1 %cond, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cond,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+; Same as @loop, but the loop is in the LCSSA form.
+define i32 @lcssa() {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'lcssa'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  br label %loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
+; CHECK-NOT: DIVERGENT: %i =
+  %i1 = add i32 %i, 1
+  %exit_cond = icmp sge i32 %i1, %tid
+  br i1 %exit_cond, label %loop_exit, label %loop
+loop_exit:
+  %i.lcssa = phi i32 [ %i, %loop ]
+; CHECK: DIVERGENT: %i.lcssa =
+  %cond = icmp eq i32 %i.lcssa, 10
+  br i1 %cond, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cond,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+; Verifies sync-dependence is computed correctly in the absense of loops.
+define i32 @sync_no_loop(i32 %arg) {
+entry:
+  %0 = add i32 %arg, 1
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %1 = icmp sge i32 %tid, 10
+  br i1 %1, label %bb1, label %bb2
+
+bb1:
+  br label %bb3
+
+bb2:
+  br label %bb3
+
+bb3:
+  %2 = add i32 %0, 2
+  ; CHECK-NOT: DIVERGENT: %2
+  ret i32 %2
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+
+!nvvm.annotations = !{!0, !1, !2, !3, !4}
+!0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1}
+!1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1}
+!2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1}
+!3 = !{i32 ()* @loop, !"kernel", i32 1}
+!4 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1}
Index: test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll
@@ -0,0 +1,30 @@
+; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cond.var = icmp slt i32 %tid, 0
+  br i1 %cond.var, label %B, label %C ; divergent
+; CHECK: DIVERGENT: br i1 %cond.var,
+B:
+  %cond.uni = icmp slt i32 %n, 0
+  br i1 %cond.uni, label %C, label %merge ; uniform
+; CHECK-NOT: DIVERGENT: br i1 %cond.uni,
+C:
+  %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B  ]
+; CHECK: DIVERGENT: %phi.var.hidden = phi i32
+  br label %merge
+merge:
+  %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ]
+; CHECK: DIVERGENT: %phi.ipd = phi i32
+  ret i32 %phi.ipd
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+
+!nvvm.annotations = !{!0}
+!0 = !{i32 (i32, i32, i32)* @hidden_diverge, !"kernel", i32 1}
Index: test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/NVPTX/irreducible.ll
@@ -0,0 +1,55 @@
+; RUN: opt %s -analyze -divergence -use-gpu-divergence-analysis | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; This test contains an unstructured loop.
+;           +-------------- entry ----------------+
+;           |                                     |
+;           V                                     V
+; i1 = phi(0, i3)                            i2 = phi(0, i3)
+;     j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2
+;           ^                 |                   ^
+;           |                 V                   |
+;           +-------- switch (tid / i3) ----------+
+;                             |
+;                             V
+;                        if (i3 == 5) // divergent
+; because sync dependent on (tid / i3).
+define i32 @unstructured_loop(i1 %entry_cond) {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'unstructured_loop'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
+loop_entry_1:
+  %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
+  %j1 = add i32 %i1, 1
+  br label %loop_body
+loop_entry_2:
+  %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
+  %j2 = add i32 %i2, 2
+  br label %loop_body
+loop_body:
+  %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ]
+  br label %loop_latch
+loop_latch:
+  %div = sdiv i32 %tid, %i3
+  switch i32 %div, label %branch [ i32 1, label %loop_entry_1
+                                   i32 2, label %loop_entry_2 ]
+branch:
+  %cmp = icmp eq i32 %i3, 5
+  br i1 %cmp, label %then, label %else
+; CHECK: DIVERGENT: br i1 %cmp,
+then:
+  ret i32 0
+else:
+  ret i32 1
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+
+!nvvm.annotations = !{!0}
+!0 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1}
Index: test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True