Index: include/llvm/ADT/PostOrderIterator.h
===================================================================
--- include/llvm/ADT/PostOrderIterator.h
+++ include/llvm/ADT/PostOrderIterator.h
@@ -296,12 +296,15 @@
 
 public:
   using rpo_iterator = typename std::vector<NodeRef>::reverse_iterator;
+  using const_rpo_iterator = typename std::vector<NodeRef>::const_reverse_iterator;
 
   ReversePostOrderTraversal(GraphT G) { Initialize(GT::getEntryNode(G)); }
 
   // Because we want a reverse post order, use reverse iterators from the vector
   rpo_iterator begin() { return Blocks.rbegin(); }
+  const_rpo_iterator begin() const { return Blocks.crbegin(); }
   rpo_iterator end() { return Blocks.rend(); }
+  const_rpo_iterator end() const { return Blocks.crend(); }
 };
 
 } // end namespace llvm
Index: include/llvm/Analysis/DivergenceAnalysis.h
===================================================================
--- include/llvm/Analysis/DivergenceAnalysis.h
+++ include/llvm/Analysis/DivergenceAnalysis.h
@@ -7,55 +7,215 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// The divergence analysis is an LLVM pass which can be used to find out
-// if a branch instruction in a GPU program is divergent or not. It can help
-// branch optimizations such as jump threading and loop unswitching to make
-// better decisions.
+// The divergence analysis determines which instructions and branches are
+// divergent given a set of divergent source instructions.
 //
 //===----------------------------------------------------------------------===//
-#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
-#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
+
+#ifndef LLVM_ANALYSIS_DIVERGENCEANALYSIS_H
+#define LLVM_ANALYSIS_DIVERGENCEANALYSIS_H
 
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/SyncDependenceAnalysis.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
+#include <vector>
 
 namespace llvm {
+class Module;
 class Value;
-class DivergenceAnalysis : public FunctionPass {
+class Instruction;
+class Loop;
+class raw_ostream;
+class TargetTransformInfo;
+
+// generic divergence analysis
+class DivergenceAnalysis {
+public:
+  // this instance will analyze the whole function @F unless @regionLoop is
+  // given. In that case the analysis is restricted to @regionLoop.
+  // The analyzed part of the CFG is refered to as the region.
+  DivergenceAnalysis(const Function &F, const Loop *regionLoop,
+                     const DominatorTree &DT, const LoopInfo &LI,
+                     SyncDependenceAnalysis &SDA, bool IsLCSSAForm);
+
+  // returns the loop that defines the analyzed region (if any)
+  const Loop *getRegionLoop() const { return regionLoop; }
+  const Function &getFunction() const { return F; }
+
+  // whether @I is part of the region
+  bool inRegion(const BasicBlock &BB) const;
+  bool inRegion(const Instruction &I) const;
+
+  // mark @uniVal as a value that is always uniform
+  void addUniformOverride(const Value &uniVal);
+
+  // mark @divVal as a value that is always divergent
+  void markDivergent(const Value &divVal);
+
+  // propagate divergence to all instructions in the region.
+  // Divergence is seeded by calls to @markDivergent.
+  void compute();
+
+  // whether any value was marked or analyzed to be divergent.
+  bool hasDetectedDivergence() const { return !divergentValues.empty(); }
+
+  // whether @val will always return a uniform values regardless of its operands
+  bool isAlwaysUniform(const Value &val) const;
+
+  // whether @val is a divergent value
+  bool isDivergent(const Value &val) const;
+
+  void print(raw_ostream &OS, const Module *) const;
+
+private:
+  bool updateTerminator(const TerminatorInst &term) const;
+  bool updatePHINode(const PHINode &phi) const;
+
+  // returns whether @term should be considered divergent based on the
+  // divergence of its operands. This should only be called for
+  // non-phi, non-terminator instructions.
+  bool updateNormalInstruction(const Instruction &term) const;
+
+  // marks all users of live-out values of the loop headed by @loopHeader
+  // as divergent and puts them on the worklist.
+  void taintLoopLiveOuts(const BasicBlock &loopHeader);
+
+  // push all users of @val (in the region) to the worklist
+  void pushUsers(const Value &I);
+
+  // push all phi nodes in @block to the worklist
+  void pushPHINodes(const BasicBlock &block);
+
+  // mark @block as join divergent
+  // A block is join divergent if two threads may reach it from different
+  // incoming blocks at the same time.
+  void markBlockJoinDivergent(const BasicBlock &block) {
+    divergentJoinBlocks.insert(&block);
+  }
+
+  // @whether @val appears as a temporal divergent value when observed in
+  // @observingBlock
+  bool isTemporalDivergent(const BasicBlock &observingBlock,
+                           const Value &val) const;
+
+  // @whether @block is join divergent (see markBlockJoinDivergent)
+  bool isJoinDivergent(const BasicBlock &block) const {
+    return divergentJoinBlocks.count(&block);
+  }
+
+  // propagate control-induced divergence to users (phi nodes and instructions)
+  // @joinBlock is a divergent loop exit or join point of two disjoint paths.
+  // returns whether @joinBlock is a divergent loop exit of @termLoop.
+  bool propagateJoinDivergence(const BasicBlock &joinBlock,
+                               const Loop *termLoop);
+
+  // propagate induced value divergence due to control divergence in @term
+  void propagateBranchDivergence(const TerminatorInst &term);
+
+  // propagate induced value divergence due to loop exit divergence from
+  // @exitingLoop
+  void propagateLoopDivergence(const Loop &exitingLoop);
+
+private:
+  const Function &F;
+  // if regionLoop != nullptr, analyze only in the scope of the loop
+  // Otw, analyze the whole function
+  const Loop *regionLoop;
+
+  const DominatorTree &DT;
+  const LoopInfo &LI;
+
+  // recognized divergent loops
+  DenseSet<const Loop *> divergentLoops;
+
+  // The SDA links divergent branches to divergent control-flow joins
+  SyncDependenceAnalysis &SDA;
+
+  // use simplified code path for LCSSA form
+  bool IsLCSSAForm;
+
+  // set of known-uniform values
+  DenseSet<const Value *> uniformOverrides;
+
+  // blocks with joining divergent control from different predecessors
+  DenseSet<const BasicBlock *> divergentJoinBlocks;
+
+  // detected/marked divergent values
+  DenseSet<const Value *> divergentValues;
+
+  // internal worklist for divergence propagation
+  std::vector<const Instruction *> worklist;
+};
+
+// divergence analysis frontend for loops
+class LoopDivergenceAnalysis {
+public:
+  LoopDivergenceAnalysis(const DominatorTree &DT, const LoopInfo &LI,
+                         SyncDependenceAnalysis &SDA, const Loop &loop);
+
+  // whether @V is divergent
+  bool isDivergent(const Value &val) const;
+
+  // whether @V is uniform/non-divergent
+  bool isUniform(const Value &val) const { return !isDivergent(val); }
+
+  // print all divergent values in the loop.
+  void print(raw_ostream &OS, const Module *) const;
+
+private:
+  DivergenceAnalysis DA;
+};
+
+// loop divergence printer pass - for standalone testing
+class LoopDivergencePrinter : public FunctionPass {
 public:
   static char ID;
 
-  DivergenceAnalysis() : FunctionPass(ID) {
-    initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry());
+  LoopDivergencePrinter() : FunctionPass(ID) {
+    initializeLoopDivergencePrinterPass(*PassRegistry::getPassRegistry());
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 
+  // analyze all loop-divergence of all loops in @F and print the results
   bool runOnFunction(Function &F) override;
 
-  // Print all divergent branches in the function.
+  // print all divergent values in the function
   void print(raw_ostream &OS, const Module *) const override;
 
-  // Returns true if V is divergent at its definition.
-  //
-  // Even if this function returns false, V may still be divergent when used
-  // in a different basic block.
-  bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
+private:
+  std::unique_ptr<SyncDependenceAnalysis> SDA;
+  SmallVector<std::unique_ptr<LoopDivergenceAnalysis>, 6> loopDivInfo;
+};
 
-  // Returns true if V is uniform/non-divergent.
-  //
-  // Even if this function returns true, V may still be divergent when used
-  // in a different basic block.
-  bool isUniform(const Value *V) const { return !isDivergent(V); }
+// divergence analysis frontend for GPU kernels
+class GPUDivergenceAnalysis {
+  SyncDependenceAnalysis SDA;
+  DivergenceAnalysis DA;
 
-  // Keep the analysis results uptodate by removing an erased value.
-  void removeValue(const Value *V) { DivergentValues.erase(V); }
+public:
+  // runs the divergence analysis on @F, a GPU kernel
+  GPUDivergenceAnalysis(Function &F, const DominatorTree &DT,
+                        const PostDominatorTree &PDT, const LoopInfo &LI,
+                        const TargetTransformInfo &TTI);
 
-private:
-  // Stores all divergent values.
-  DenseSet<const Value *> DivergentValues;
+  // whether any divergence was detected in @F
+  bool hasDivergence() const { return DA.hasDetectedDivergence(); }
+
+  // the GPU kernel this analysis result is for
+  const Function &getFunction() const { return DA.getFunction(); }
+
+  // whether @V is divergent
+  bool isDivergent(const Value &val) const;
+
+  // whether @V is uniform/non-divergent
+  bool isUniform(const Value &val) const { return !isDivergent(val); }
+
+  // print all divergent values in the kernel F
+  void print(raw_ostream &OS, const Module *) const;
 };
-} // End llvm namespace
 
-#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
\ No newline at end of file
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_DIVERGENCEANALYSIS_H
Index: include/llvm/Analysis/KernelDivergenceAnalysis.h
===================================================================
--- include/llvm/Analysis/KernelDivergenceAnalysis.h
+++ include/llvm/Analysis/KernelDivergenceAnalysis.h
@@ -1,4 +1,4 @@
-//===- llvm/Analysis/DivergenceAnalysis.h - Divergence Analysis -*- C++ -*-===//
+//===- llvm/Analysis/KernelDivergenceAnalysis.h - KernelDivergence Analysis -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// The divergence analysis is an LLVM pass which can be used to find out
-// if a branch instruction in a GPU program is divergent or not. It can help
+// The kernel divergence analysis is an LLVM pass which can be used to find out
+// if a branch instruction in a GPU program (kernel) is divergent or not. It can help
 // branch optimizations such as jump threading and loop unswitching to make
 // better decisions.
 //
@@ -19,15 +19,17 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 
 namespace llvm {
 class Value;
-class DivergenceAnalysis : public FunctionPass {
+class GPUDivergenceAnalysis;
+class KernelDivergenceAnalysis : public FunctionPass {
 public:
   static char ID;
 
-  DivergenceAnalysis() : FunctionPass(ID) {
-    initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry());
+  KernelDivergenceAnalysis() : FunctionPass(ID) {
+    initializeKernelDivergenceAnalysisPass(*PassRegistry::getPassRegistry());
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -41,7 +43,7 @@
   //
   // Even if this function returns false, V may still be divergent when used
   // in a different basic block.
-  bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
+  bool isDivergent(const Value *V) const;
 
   // Returns true if V is uniform/non-divergent.
   //
@@ -53,9 +55,12 @@
   void removeValue(const Value *V) { DivergentValues.erase(V); }
 
 private:
+  // (optional) handle to new DivergenceAnalysis
+  std::unique_ptr<GPUDivergenceAnalysis> gpuDA;
+
   // Stores all divergent values.
   DenseSet<const Value *> DivergentValues;
 };
 } // End llvm namespace
 
-#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
\ No newline at end of file
+#endif //LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
Index: include/llvm/Analysis/Passes.h
===================================================================
--- include/llvm/Analysis/Passes.h
+++ include/llvm/Analysis/Passes.h
@@ -61,10 +61,17 @@
 
   //===--------------------------------------------------------------------===//
   //
-  // createDivergenceAnalysisPass - This pass determines which branches in a GPU
+  // createKernelDivergenceAnalysisPass - This pass determines which branches in a GPU
   // program are divergent.
   //
-  FunctionPass *createDivergenceAnalysisPass();
+  FunctionPass *createKernelDivergenceAnalysisPass();
+
+  //===--------------------------------------------------------------------===//
+  //
+  // createLoopDivergencePrinterPass - This pass determines which branches and
+  // instructions in a loop are divergent.
+  //
+  FunctionPass *createLoopDivergencePrinterPass();
 
   //===--------------------------------------------------------------------===//
   //
Index: include/llvm/Analysis/SyncDependenceAnalysis.h
===================================================================
--- /dev/null
+++ include/llvm/Analysis/SyncDependenceAnalysis.h
@@ -0,0 +1,75 @@
+//===- SyncDependenceAnalysis.h - Divergent Branch Dependence -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SyncDependenceAnalysis class, which computes for
+// every divergent branch the set of phi nodes that the branch will make
+// divergent.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_BRANCHDEPENDENCEANALYSIS_H
+#define LLVM_ANALYSIS_BRANCHDEPENDENCEANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class Loop;
+class PostDominatorTree;
+class TerminatorInst;
+class TerminatorInst;
+
+using ConstBlockSet = SmallPtrSet<const BasicBlock *, 4>;
+
+// TODO add LLVM pass manager wrappings
+class SyncDependenceAnalysis {
+  void visitSuccessor(const BasicBlock &succBlock, const Loop *termLoop,
+                      const BasicBlock *defBlock);
+
+public:
+  bool inRegion(const BasicBlock &BB) const;
+
+  ~SyncDependenceAnalysis();
+  SyncDependenceAnalysis(const DominatorTree &DT, const PostDominatorTree &PDT,
+                         const LoopInfo &LI);
+
+  // the set of blocks which are reachable by disjoint paths from @term.
+  // The set also contains loop exits if there two disjoin paths:
+  // one from @term to the loop exit and another from @term to the loop header.
+  // Those exit blocks are added to the returned set.
+  // If L is the parent loop of @term and an exit of L is in the returned set
+  // then L is a divergent loop.
+  const ConstBlockSet &join_blocks(const TerminatorInst &term);
+
+  // the set of blocks which are reachable by disjoin paths from the
+  // loop exits of @loop.
+  // This treats the loop as a single node in @loop's parent loop.
+  // The returned set has the same properties as for join_blocks(TermInst&).
+  const ConstBlockSet &join_blocks(const Loop &loop);
+
+private:
+  static ConstBlockSet emptyBlockSet;
+
+  ReversePostOrderTraversal<const Function *> funcRPOT;
+  const DominatorTree &DT;
+  const PostDominatorTree &PDT;
+  const LoopInfo &LI;
+
+  std::map<const Loop *, ConstBlockSet *> cachedLoopExitJoins;
+  std::map<const TerminatorInst *, ConstBlockSet *> cachedBranchJoins;
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_BRANCHDEPENDENCEANALYSIS_H
Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -289,7 +289,7 @@
   /// Returns whether V is a source of divergence.
   ///
   /// This function provides the target-dependent information for
-  /// the target-independent DivergenceAnalysis. DivergenceAnalysis first
+  /// the target-independent KernelDivergenceAnalysis. KernelDivergenceAnalysis first
   /// builds the dependency graph, and then runs the reachability algorithm
   /// starting with the sources of divergence.
   bool isSourceOfDivergence(const Value *V) const;
Index: include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- include/llvm/CodeGen/SelectionDAG.h
+++ include/llvm/CodeGen/SelectionDAG.h
@@ -28,7 +28,7 @@
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/KernelDivergenceAnalysis.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
@@ -229,7 +229,7 @@
   LLVMContext *Context;
   CodeGenOpt::Level OptLevel;
 
-  DivergenceAnalysis * DA = nullptr;
+  KernelDivergenceAnalysis * DA = nullptr;
   FunctionLoweringInfo * FLI = nullptr;
 
   /// The function-level optimization remark emitter.  Used to emit remarks
@@ -382,7 +382,7 @@
   /// Prepare this SelectionDAG to process code in the given MachineFunction.
   void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
             Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
-            DivergenceAnalysis * Divergence);
+            KernelDivergenceAnalysis * Divergence);
 
   void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) {
     FLI = FuncInfo;
Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -29,7 +29,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/KernelDivergenceAnalysis.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
@@ -2648,7 +2648,7 @@
 
   virtual bool isSDNodeSourceOfDivergence(const SDNode *N,
                                           FunctionLoweringInfo *FLI,
-                                          DivergenceAnalysis *DA) const {
+                                          KernelDivergenceAnalysis *DA) const {
     return false;
   }
 
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -118,6 +118,8 @@
 void initializeDependenceAnalysisPass(PassRegistry&);
 void initializeDependenceAnalysisWrapperPassPass(PassRegistry&);
 void initializeDetectDeadLanesPass(PassRegistry&);
+void initializeKernelDivergenceAnalysisPass(PassRegistry&);
+void initializeLoopDivergencePrinterPass(PassRegistry&);
 void initializeDivRemPairsLegacyPassPass(PassRegistry&);
 void initializeDivergenceAnalysisPass(PassRegistry&);
 void initializeDomOnlyPrinterPass(PassRegistry&);
Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -94,7 +94,8 @@
       (void) llvm::createDeadInstEliminationPass();
       (void) llvm::createDeadStoreEliminationPass();
       (void) llvm::createDependenceAnalysisWrapperPass();
-      (void) llvm::createDivergenceAnalysisPass();
+      (void) llvm::createKernelDivergenceAnalysisPass();
+      (void) llvm::createLoopDivergencePrinterPass();
       (void) llvm::createDomOnlyPrinterPass();
       (void) llvm::createDomPrinterPass();
       (void) llvm::createDomOnlyViewerPass();
Index: lib/Analysis/Analysis.cpp
===================================================================
--- lib/Analysis/Analysis.cpp
+++ lib/Analysis/Analysis.cpp
@@ -39,7 +39,8 @@
   initializeDependenceAnalysisWrapperPassPass(Registry);
   initializeDelinearizationPass(Registry);
   initializeDemandedBitsWrapperPassPass(Registry);
-  initializeDivergenceAnalysisPass(Registry);
+  initializeKernelDivergenceAnalysisPass(Registry);
+  initializeLoopDivergencePrinterPass(Registry);
   initializeDominanceFrontierWrapperPassPass(Registry);
   initializeDomViewerPass(Registry);
   initializeDomPrinterPass(Registry);
Index: lib/Analysis/CMakeLists.txt
===================================================================
--- lib/Analysis/CMakeLists.txt
+++ lib/Analysis/CMakeLists.txt
@@ -38,6 +38,7 @@
   Interval.cpp
   IntervalPartition.cpp
   IteratedDominanceFrontier.cpp
+  KernelDivergenceAnalysis.cpp
   LazyBranchProbabilityInfo.cpp
   LazyBlockFrequencyInfo.cpp
   LazyCallGraph.cpp
@@ -76,6 +77,7 @@
   ScalarEvolutionAliasAnalysis.cpp
   ScalarEvolutionExpander.cpp
   ScalarEvolutionNormalization.cpp
+  SyncDependenceAnalysis.cpp
   SyntheticCountsUtils.cpp
   TargetLibraryInfo.cpp
   TargetTransformInfo.cpp
Index: lib/Analysis/DivergenceAnalysis.cpp
===================================================================
--- lib/Analysis/DivergenceAnalysis.cpp
+++ lib/Analysis/DivergenceAnalysis.cpp
@@ -7,9 +7,10 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements divergence analysis which determines whether a branch
-// in a GPU program is divergent.It can help branch optimizations such as jump
-// threading and loop unswitching to make better decisions.
+// This file implements a general divergence analysis for loop vectorization
+// and GPU programs. It determines whether a branch in a loop or GPU program
+// is divergent. It can help branch optimizations such as jump threading and
+// loop unswitching to make better decisions.
 //
 // GPU programs typically use the SIMD execution model, where multiple threads
 // in the same execution group have to execute in lock-step. Therefore, if the
@@ -25,16 +26,23 @@
 // optimizations.
 //
 // This file defines divergence analysis which computes a conservative but
-// non-trivial approximation of all divergent branches in a GPU program. It
-// partially implements the approach described in
+// non-trivial approximation of all divergent branches in a GPU program. This
+// implementation is derived from the Vectorization Analysis of the Region
+// Vectorizer (RV). That implementation in turn is based on the approach
+// described in
 //
-//   Divergence Analysis
-//   Sampaio, Souza, Collange, Pereira
-//   TOPLAS '13
+//   Improving Performance of OpenCL on CPUs
+//   Ralf Karrenberg and Sebastian Hack
+//   CC '12
 //
-// The divergence analysis identifies the sources of divergence (e.g., special
-// variables that hold the thread ID), and recursively marks variables that are
-// data or sync dependent on a source of divergence as divergent.
+// The DivergenceAnalysis implementation is generic in the sense that it doe
+// not itself identify original sources of divergence.
+// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and
+// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence
+// (e.g., special variables that hold the thread ID or the iteration variable).
+//
+// The generic implementation propagates divergence to variables that are data
+// or sync dependent on a source of divergence.
 //
 // While data dependency is a well-known concept, the notion of sync dependency
 // is worth more explanation. Sync dependence characterizes the control flow
@@ -54,287 +62,476 @@
 // because the branch "br i1 %cond" depends on %tid and affects which value %a
 // is assigned to.
 //
-// The current implementation has the following limitations:
+// The sync dependence detection (which branch induces divergence in which join
+// points) is implemented in the SyncDependenceAnalysis.
+//
+// The current DivergenceAnalysis implementation has the following limitations:
 // 1. intra-procedural. It conservatively considers the arguments of a
 //    non-kernel-entry function and the return value of a function call as
 //    divergent.
 // 2. memory as black box. It conservatively considers values loaded from
 //    generic or local address as divergent. This can be improved by leveraging
-//    pointer analysis.
+//    pointer analysis and/or by modelling non-escaping memory objects in SSA
+//    as done in RV.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <vector>
+
 using namespace llvm;
 
-#define DEBUG_TYPE "divergence"
-
-namespace {
-
-class DivergencePropagator {
-public:
-  DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT,
-                       PostDominatorTree &PDT, DenseSet<const Value *> &DV)
-      : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {}
-  void populateWithSourcesOfDivergence();
-  void propagate();
-
-private:
-  // A helper function that explores data dependents of V.
-  void exploreDataDependency(Value *V);
-  // A helper function that explores sync dependents of TI.
-  void exploreSyncDependency(TerminatorInst *TI);
-  // Computes the influence region from Start to End. This region includes all
-  // basic blocks on any simple path from Start to End.
-  void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End,
-                              DenseSet<BasicBlock *> &InfluenceRegion);
-  // Finds all users of I that are outside the influence region, and add these
-  // users to Worklist.
-  void findUsersOutsideInfluenceRegion(
-      Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion);
-
-  Function &F;
-  TargetTransformInfo &TTI;
-  DominatorTree &DT;
-  PostDominatorTree &PDT;
-  std::vector<Value *> Worklist; // Stack for DFS.
-  DenseSet<const Value *> &DV;   // Stores all divergent values.
-};
-
-void DivergencePropagator::populateWithSourcesOfDivergence() {
-  Worklist.clear();
-  DV.clear();
-  for (auto &I : instructions(F)) {
-    if (TTI.isSourceOfDivergence(&I)) {
-      Worklist.push_back(&I);
-      DV.insert(&I);
+#define DEBUG_TYPE "divergence-analysis"
+
+// class DivergenceAnalysis
+DivergenceAnalysis::DivergenceAnalysis(
+    const Function &F, const Loop *regionLoop, const DominatorTree &DT,
+    const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
+    : F(F), regionLoop(regionLoop), DT(DT), LI(LI), SDA(SDA),
+      IsLCSSAForm(IsLCSSAForm) {}
+
+void DivergenceAnalysis::markDivergent(const Value &divVal) {
+  assert(isa<Instruction>(divVal) || isa<Argument>(divVal));
+  assert(!isAlwaysUniform(divVal) && "can not be a divergent");
+  divergentValues.insert(&divVal);
+}
+
+void DivergenceAnalysis::addUniformOverride(const Value &uniVal) {
+  uniformOverrides.insert(&uniVal);
+}
+
+bool DivergenceAnalysis::updateTerminator(const TerminatorInst &term) const {
+  if (term.getNumSuccessors() <= 1)
+    return false;
+  if (auto *branchInst = dyn_cast<BranchInst>(&term)) {
+    assert(branchInst->isConditional());
+    return isDivergent(*branchInst->getCondition());
+  } else if (auto *switchInst = dyn_cast<SwitchInst>(&term)) {
+    return isDivergent(*switchInst->getCondition());
+  } else if (isa<InvokeInst>(term)) {
+    return false; // ignore abnormal executions through landingpad
+  } else {
+    abort();
+  }
+}
+
+bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
+  // TODO function calls with side effects, etc
+  for (const auto &op : I.operands()) {
+    if (isDivergent(*op))
+      return true;
+  }
+  return false;
+}
+
+bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &observingBlock,
+                                             const Value &val) const {
+  const auto *inst = dyn_cast<const Instruction>(&val);
+  if (!inst)
+    return false;
+  const auto *observingLoop = LI.getLoopFor(&observingBlock);
+  for (const auto *loop = LI.getLoopFor(inst->getParent());
+       loop != observingLoop; loop = loop->getParentLoop()) {
+    if (divergentLoops.count(loop))
+      return true;
+  }
+
+  return false;
+}
+
+bool DivergenceAnalysis::updatePHINode(const PHINode &phi) const {
+  // joining divergent disjoint path in @phi parent block
+  if (!phi.hasConstantOrUndefValue() && isJoinDivergent(*phi.getParent())) {
+    return true;
+  }
+
+  // An incoming value could be divergent by itself.
+  // Otherwise, an incoming value could be uniform within the loop
+  // that carries its definition but it may appear divergent
+  // from outside the loop. This happens when divergent loop exits
+  // drop definitions of that uniform value in different iterations.
+  //
+  // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop
+  //   if (i*i % 7 == 0) break;    // divergent loop exit
+  // }
+  // int divI = i;                 // divI is divergent
+  for (size_t i = 0; i < phi.getNumIncomingValues(); ++i) {
+    const auto *inVal = phi.getIncomingValue(i);
+    if (isDivergent(*phi.getIncomingValue(i)) ||
+        isTemporalDivergent(*phi.getParent(), *inVal)) {
+      return true;
     }
   }
-  for (auto &Arg : F.args()) {
-    if (TTI.isSourceOfDivergence(&Arg)) {
-      Worklist.push_back(&Arg);
-      DV.insert(&Arg);
+  return false;
+}
+
+bool DivergenceAnalysis::inRegion(const Instruction &I) const {
+  return I.getParent() && inRegion(*I.getParent());
+}
+
+bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
+  return (!regionLoop && BB.getParent() == &F) || regionLoop->contains(&BB);
+}
+
+// marks all users of loop-carried values of the loop headed by @loopHeader as
+// divergent
+void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &loopHeader) {
+  auto *divLoop = LI.getLoopFor(&loopHeader);
+  assert(divLoop && "loopHeader is not actually part of a loop");
+
+  SmallVector<BasicBlock *, 8> taintStack;
+  divLoop->getExitBlocks(taintStack);
+
+  // Otherwise potential users of loop-carried values could be anywhere in the
+  // dominance region of @divLoop (including its fringes for phi nodes)
+  DenseSet<const BasicBlock *> visited;
+  for (auto *block : taintStack) {
+    visited.insert(block);
+  }
+  visited.insert(&loopHeader);
+
+  while (!taintStack.empty()) {
+    auto *userBlock = taintStack.back();
+    taintStack.pop_back();
+
+    // don't spread divergence beyond the region
+    if (!inRegion(*userBlock))
+      continue;
+
+    assert(!divLoop->contains(userBlock) &&
+           "irreducible control flow detected");
+
+    // phi nodes at the fringes of the dominance region
+    if (!DT.dominates(&loopHeader, userBlock)) {
+      // all PHI nodes of @userBlock become divergent
+      for (auto &blockInst : *userBlock) {
+        if (!isa<PHINode>(blockInst))
+          break;
+        worklist.push_back(&blockInst);
+      }
+      continue;
+    }
+
+    // taint outside users of values carried by divLoop
+    for (auto &I : *userBlock) {
+      if (isAlwaysUniform(I))
+        continue;
+      if (isDivergent(I))
+        continue;
+
+      for (auto &Op : I.operands()) {
+        auto *opInst = dyn_cast<Instruction>(&Op);
+        if (!opInst)
+          continue;
+        if (divLoop->contains(opInst->getParent())) {
+          markDivergent(I);
+          pushUsers(I);
+          break;
+        }
+      }
+    }
+
+    // visit all blocks in the dominance region
+    for (auto *succBlock : successors(userBlock)) {
+      if (!visited.insert(userBlock).second)
+        continue;
+      taintStack.push_back(succBlock);
     }
   }
 }
 
-void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) {
-  // Propagation rule 1: if branch TI is divergent, all PHINodes in TI's
-  // immediate post dominator are divergent. This rule handles if-then-else
-  // patterns. For example,
-  //
-  // if (tid < 5)
-  //   a1 = 1;
-  // else
-  //   a2 = 2;
-  // a = phi(a1, a2); // sync dependent on (tid < 5)
-  BasicBlock *ThisBB = TI->getParent();
-
-  // Unreachable blocks may not be in the dominator tree.
-  if (!DT.isReachableFromEntry(ThisBB))
-    return;
+void DivergenceAnalysis::pushPHINodes(const BasicBlock &block) {
+  for (const auto &inst : block) {
+    if (!isa<PHINode>(inst))
+      continue;
+    if (isDivergent(inst))
+      continue;
+    worklist.push_back(&inst);
+  }
+}
 
-  // If the function has no exit blocks or doesn't reach any exit blocks, the
-  // post dominator may be null.
-  DomTreeNode *ThisNode = PDT.getNode(ThisBB);
-  if (!ThisNode)
-    return;
+void DivergenceAnalysis::pushUsers(const Value &V) {
+  for (const auto *user : V.users()) {
+    const auto *userInst = dyn_cast<const Instruction>(user);
+    if (!userInst)
+      continue;
 
-  BasicBlock *IPostDom = ThisNode->getIDom()->getBlock();
-  if (IPostDom == nullptr)
-    return;
+    if (isDivergent(*userInst))
+      continue;
 
-  for (auto I = IPostDom->begin(); isa<PHINode>(I); ++I) {
-    // A PHINode is uniform if it returns the same value no matter which path is
-    // taken.
-    if (!cast<PHINode>(I)->hasConstantOrUndefValue() && DV.insert(&*I).second)
-      Worklist.push_back(&*I);
+    // only compute divergent inside loop
+    if (!inRegion(*userInst))
+      continue;
+    worklist.push_back(userInst);
   }
+}
 
-  // Propagation rule 2: if a value defined in a loop is used outside, the user
-  // is sync dependent on the condition of the loop exits that dominate the
-  // user. For example,
-  //
-  // int i = 0;
-  // do {
-  //   i++;
-  //   if (foo(i)) ... // uniform
-  // } while (i < tid);
-  // if (bar(i)) ...   // divergent
-  //
-  // A program may contain unstructured loops. Therefore, we cannot leverage
-  // LoopInfo, which only recognizes natural loops.
-  //
-  // The algorithm used here handles both natural and unstructured loops.  Given
-  // a branch TI, we first compute its influence region, the union of all simple
-  // paths from TI to its immediate post dominator (IPostDom). Then, we search
-  // for all the values defined in the influence region but used outside. All
-  // these users are sync dependent on TI.
-  DenseSet<BasicBlock *> InfluenceRegion;
-  computeInfluenceRegion(ThisBB, IPostDom, InfluenceRegion);
-  // An insight that can speed up the search process is that all the in-region
-  // values that are used outside must dominate TI. Therefore, instead of
-  // searching every basic blocks in the influence region, we search all the
-  // dominators of TI until it is outside the influence region.
-  BasicBlock *InfluencedBB = ThisBB;
-  while (InfluenceRegion.count(InfluencedBB)) {
-    for (auto &I : *InfluencedBB)
-      findUsersOutsideInfluenceRegion(I, InfluenceRegion);
-    DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom();
-    if (IDomNode == nullptr)
-      break;
-    InfluencedBB = IDomNode->getBlock();
+bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &joinBlock,
+                                                 const Loop *branchLoop) {
+  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << joinBlock.getName() << "\n");
+
+  // ignore divergence outside the region
+  if (!inRegion(joinBlock))
+    return false;
+
+  // push non-divergent phi nodes in @joinBlock to the worklist
+  pushPHINodes(joinBlock);
+
+  // @joinBlock is a divergent loop exit
+  if (branchLoop && !branchLoop->contains(&joinBlock)) {
+    return true;
+
+  } else {
+    // disjoint-paths divergente at @joinBlock
+    markBlockJoinDivergent(joinBlock);
+    return false;
   }
 }
 
-void DivergencePropagator::findUsersOutsideInfluenceRegion(
-    Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion) {
-  for (User *U : I.users()) {
-    Instruction *UserInst = cast<Instruction>(U);
-    if (!InfluenceRegion.count(UserInst->getParent())) {
-      if (DV.insert(UserInst).second)
-        Worklist.push_back(UserInst);
+void DivergenceAnalysis::propagateBranchDivergence(const TerminatorInst &term) {
+  LLVM_DEBUG(dbgs() << "propBranchDiv " << term.getParent()->getName() << "\n");
+
+  markDivergent(term);
+
+  const auto *branchLoop = LI.getLoopFor(term.getParent());
+
+  // whether there is a divergent loop exit from @branchLoop (if any)
+  bool isBranchLoopDivergent = false;
+
+  // iterate over all blocks reachable by disjoint from @term within the loop
+  // also iterates over loop exits that become divergent due to @term.
+  for (const auto *joinBlock : SDA.join_blocks(term)) {
+    isBranchLoopDivergent |= propagateJoinDivergence(*joinBlock, branchLoop);
+  }
+
+  // @branch loop is a divergent loop due to the divergent branch in @term
+  if (isBranchLoopDivergent) {
+    assert(branchLoop);
+    if (!divergentLoops.insert(branchLoop).second) {
+      return;
     }
+    propagateLoopDivergence(*branchLoop);
   }
 }
 
-// A helper function for computeInfluenceRegion that adds successors of "ThisBB"
-// to the influence region.
-static void
-addSuccessorsToInfluenceRegion(BasicBlock *ThisBB, BasicBlock *End,
-                               DenseSet<BasicBlock *> &InfluenceRegion,
-                               std::vector<BasicBlock *> &InfluenceStack) {
-  for (BasicBlock *Succ : successors(ThisBB)) {
-    if (Succ != End && InfluenceRegion.insert(Succ).second)
-      InfluenceStack.push_back(Succ);
+void DivergenceAnalysis::propagateLoopDivergence(const Loop &exitingLoop) {
+  LLVM_DEBUG(dbgs() << "propLoopDiv " << exitingLoop.getName() << "\n");
+
+  // don't propagate beyond region
+  if (!inRegion(*exitingLoop.getHeader()))
+    return;
+
+  const auto *branchLoop = exitingLoop.getParentLoop();
+
+  // Uses of loop-carried values could occur anywhere
+  // within the dominance region of the definition. All loop-carried
+  // definitions are dominated by the loop header (reducible control).
+  // Thus all users have to be in the dominance region of the loop header,
+  // except PHI nodes that can also live at the fringes of the dom region
+  // (incoming defining value).
+  if (!IsLCSSAForm)
+    taintLoopLiveOuts(*exitingLoop.getHeader());
+
+  // whether there is a divergent loop exit from @branchLoop (if any)
+  bool isBranchLoopDivergent = false;
+
+  // iterate over all blocks reachable by disjoint paths from exits of
+  // @exitingLoop also iterates over loop exits (of @branchLoop) that in turn
+  // become divergent.
+  for (const auto *joinBlock : SDA.join_blocks(exitingLoop)) {
+    isBranchLoopDivergent |= propagateJoinDivergence(*joinBlock, branchLoop);
+  }
+
+  // @branch loop is a divergent due to divergent loop exit in @exitingLoop
+  if (isBranchLoopDivergent) {
+    assert(branchLoop);
+    if (!divergentLoops.insert(branchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*branchLoop);
   }
 }
 
-void DivergencePropagator::computeInfluenceRegion(
-    BasicBlock *Start, BasicBlock *End,
-    DenseSet<BasicBlock *> &InfluenceRegion) {
-  assert(PDT.properlyDominates(End, Start) &&
-         "End does not properly dominate Start");
-
-  // The influence region starts from the end of "Start" to the beginning of
-  // "End". Therefore, "Start" should not be in the region unless "Start" is in
-  // a loop that doesn't contain "End".
-  std::vector<BasicBlock *> InfluenceStack;
-  addSuccessorsToInfluenceRegion(Start, End, InfluenceRegion, InfluenceStack);
-  while (!InfluenceStack.empty()) {
-    BasicBlock *BB = InfluenceStack.back();
-    InfluenceStack.pop_back();
-    addSuccessorsToInfluenceRegion(BB, End, InfluenceRegion, InfluenceStack);
+void DivergenceAnalysis::compute() {
+  for (auto *divVal : divergentValues) {
+    pushUsers(*divVal);
+  }
+
+  // propagate divergence
+  while (!worklist.empty()) {
+    const Instruction &I = *worklist.back();
+    worklist.pop_back();
+
+    // maintain uniformity of overrides
+    if (isAlwaysUniform(I))
+      continue;
+
+    bool wasDivergent = isDivergent(I);
+    if (wasDivergent)
+      continue;
+
+    // propagate divergence caused by terminator
+    if (isa<TerminatorInst>(I)) {
+      auto &term = cast<TerminatorInst>(I);
+      if (updateTerminator(term)) {
+        // propagate control divergence to affected instructions
+        propagateBranchDivergence(term);
+        continue;
+      }
+    }
+
+    // update divergence of I due to divergent operands
+    bool divergentUpd = false;
+    if (isa<PHINode>(I)) {
+      divergentUpd = updatePHINode(cast<PHINode>(I));
+    } else {
+      divergentUpd = updateNormalInstruction(I);
+    }
+
+    // propagate value divergence to users
+    if (divergentUpd) {
+      markDivergent(I);
+      pushUsers(I);
+    }
   }
 }
 
-void DivergencePropagator::exploreDataDependency(Value *V) {
-  // Follow def-use chains of V.
-  for (User *U : V->users()) {
-    Instruction *UserInst = cast<Instruction>(U);
-    if (!TTI.isAlwaysUniform(U) && DV.insert(UserInst).second)
-      Worklist.push_back(UserInst);
+bool DivergenceAnalysis::isAlwaysUniform(const Value &val) const {
+  return uniformOverrides.count(&val);
+}
+
+bool DivergenceAnalysis::isDivergent(const Value &val) const {
+  return divergentValues.count(&val);
+}
+
+void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
+  if (divergentValues.empty())
+    return;
+  // iterate instructions using instructions() to ensure a deterministic order.
+  for (auto &I : instructions(F)) {
+    if (divergentValues.count(&I))
+      OS << "DIVERGENT:" << I << "\n";
   }
 }
 
-void DivergencePropagator::propagate() {
-  // Traverse the dependency graph using DFS.
-  while (!Worklist.empty()) {
-    Value *V = Worklist.back();
-    Worklist.pop_back();
-    if (TerminatorInst *TI = dyn_cast<TerminatorInst>(V)) {
-      // Terminators with less than two successors won't introduce sync
-      // dependency. Ignore them.
-      if (TI->getNumSuccessors() > 1)
-        exploreSyncDependency(TI);
-    }
-    exploreDataDependency(V);
+// class LoopDivergenceAnalysis
+LoopDivergenceAnalysis::LoopDivergenceAnalysis(const DominatorTree &DT,
+                                               const LoopInfo &LI,
+                                               SyncDependenceAnalysis &SDA,
+                                               const Loop &loop)
+    : DA(*loop.getHeader()->getParent(), &loop, DT, LI, SDA, true) {
+  for (const auto &I : *loop.getHeader()) {
+    if (!isa<PHINode>(I))
+      break;
+    DA.markDivergent(I);
   }
+
+  // after the scalar remainder loop is extracted, the loop exit condition will
+  // be uniform
+  auto loopExitingInst = loop.getExitingBlock()->getTerminator();
+  auto loopExitCond = cast<BranchInst>(loopExitingInst)->getCondition();
+  DA.addUniformOverride(*loopExitCond);
+
+  DA.compute();
 }
 
-} /// end namespace anonymous
+bool LoopDivergenceAnalysis::isDivergent(const Value &val) const {
+  return DA.isDivergent(val);
+}
+
+void LoopDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
+  OS << "Divergence of loop " << DA.getRegionLoop()->getName() << " {\n";
+  DA.print(OS, mod);
+  OS << "}\n";
+}
+
+// class LoopDivergencePrinter
+bool LoopDivergencePrinter::runOnFunction(Function &F) {
+  const PostDominatorTree &PDT =
+      getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  const DominatorTree &DT =
+      getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  const LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SDA = make_unique<SyncDependenceAnalysis>(DT, PDT, LI);
+
+  for (auto &BB : F) {
+    auto *loop = LI.getLoopFor(&BB);
+    if (!loop || loop->getHeader() != &BB)
+      continue;
+    loopDivInfo.push_back(
+        make_unique<LoopDivergenceAnalysis>(DT, LI, *SDA, *loop));
+  }
+
+  return false;
+}
+
+void LoopDivergencePrinter::print(raw_ostream &OS, const Module *mod) const {
+  for (auto &divInfo : loopDivInfo) {
+    divInfo->print(OS, mod);
+  }
+}
 
 // Register this pass.
-char DivergenceAnalysis::ID = 0;
-INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis",
-                      false, true)
+char LoopDivergencePrinter::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDivergencePrinter, "loop-divergence",
+                      "Loop Divergence Printer", false, true)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis",
-                    false, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopDivergencePrinter, "loop-divergence",
+                    "Loop Divergence Printer", false, true)
 
-FunctionPass *llvm::createDivergenceAnalysisPass() {
-  return new DivergenceAnalysis();
+FunctionPass *llvm::createLoopDivergencePrinterPass() {
+  return new LoopDivergencePrinter();
 }
 
-void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+void LoopDivergencePrinter::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addRequired<PostDominatorTreeWrapperPass>();
+  AU.addRequired<LoopInfoWrapperPass>();
   AU.setPreservesAll();
 }
 
-bool DivergenceAnalysis::runOnFunction(Function &F) {
-  auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
-  if (TTIWP == nullptr)
-    return false;
-
-  TargetTransformInfo &TTI = TTIWP->getTTI(F);
-  // Fast path: if the target does not have branch divergence, we do not mark
-  // any branch as divergent.
-  if (!TTI.hasBranchDivergence())
-    return false;
+// class GPUDivergenceAnalysis
+GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F,
+                                             const DominatorTree &DT,
+                                             const PostDominatorTree &PDT,
+                                             const LoopInfo &LI,
+                                             const TargetTransformInfo &TTI)
+    : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
+  for (auto &I : instructions(F)) {
+    if (TTI.isSourceOfDivergence(&I)) {
+      DA.markDivergent(I);
+    } else if (TTI.isAlwaysUniform(&I)) {
+      DA.addUniformOverride(I);
+    }
+  }
+  for (auto &Arg : F.args()) {
+    if (TTI.isSourceOfDivergence(&Arg)) {
+      DA.markDivergent(Arg);
+    }
+  }
 
-  DivergentValues.clear();
-  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-  DivergencePropagator DP(F, TTI,
-                          getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-                          PDT, DivergentValues);
-  DP.populateWithSourcesOfDivergence();
-  DP.propagate();
-  LLVM_DEBUG(
-    dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n";
-    print(dbgs(), F.getParent())
-  );
-  return false;
+  DA.compute();
 }
 
-void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
-  if (DivergentValues.empty())
-    return;
-  const Value *FirstDivergentValue = *DivergentValues.begin();
-  const Function *F;
-  if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
-    F = Arg->getParent();
-  } else if (const Instruction *I =
-                 dyn_cast<Instruction>(FirstDivergentValue)) {
-    F = I->getParent()->getParent();
-  } else {
-    llvm_unreachable("Only arguments and instructions can be divergent");
-  }
+bool GPUDivergenceAnalysis::isDivergent(const Value &val) const {
+  return DA.isDivergent(val);
+}
 
-  // Dumps all divergent values in F, arguments and then instructions.
-  for (auto &Arg : F->args()) {
-    OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : "           ");
-    OS << Arg << "\n";
-  }
-  // Iterate instructions using instructions() to ensure a deterministic order.
-  for (auto BI = F->begin(), BE = F->end(); BI != BE; ++BI) {
-    auto &BB = *BI;
-    OS << "\n           " << BB.getName() << ":\n";
-    for (auto &I : BB.instructionsWithoutDebug()) {
-      OS << (DivergentValues.count(&I) ? "DIVERGENT:     " : "               ");
-      OS << I << "\n";
-    }
-  }
-  OS << "\n";
+void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
+  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
+  DA.print(OS, mod);
+  OS << "}\n";
 }
Index: lib/Analysis/KernelDivergenceAnalysis.cpp
===================================================================
--- lib/Analysis/KernelDivergenceAnalysis.cpp
+++ lib/Analysis/KernelDivergenceAnalysis.cpp
@@ -1,4 +1,4 @@
-//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
+//===- KernelDivergenceAnalysis.cpp --------- Kernel Divergence Analysis Implementation -==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -65,6 +65,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/KernelDivergenceAnalysis.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -79,6 +80,12 @@
 
 #define DEBUG_TYPE "divergence"
 
+// transparently use the GPUDivergenceAnalysis
+static cl::opt<bool> UseGPUDA(
+    "use-gpu-da", cl::init(false), cl::Hidden,
+    cl::desc(
+        "turn the KernelDivergenceAnalysis into a wrapper for GPUDivergenceAnalysis"));
+
 namespace {
 
 class DivergencePropagator {
@@ -265,25 +272,27 @@
 } /// end namespace anonymous
 
 // Register this pass.
-char DivergenceAnalysis::ID = 0;
-INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis",
+char KernelDivergenceAnalysis::ID = 0;
+INITIALIZE_PASS_BEGIN(KernelDivergenceAnalysis, "divergence", "Kernel Divergence Analysis",
                       false, true)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis",
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(KernelDivergenceAnalysis, "divergence", "Kernel Divergence Analysis",
                     false, true)
 
-FunctionPass *llvm::createDivergenceAnalysisPass() {
-  return new DivergenceAnalysis();
+FunctionPass *llvm::createKernelDivergenceAnalysisPass() {
+  return new KernelDivergenceAnalysis();
 }
 
-void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+void KernelDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addRequired<PostDominatorTreeWrapperPass>();
+  if (UseGPUDA) AU.addRequired<LoopInfoWrapperPass>();
   AU.setPreservesAll();
 }
 
-bool DivergenceAnalysis::runOnFunction(Function &F) {
+bool KernelDivergenceAnalysis::runOnFunction(Function &F) {
   auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
   if (TTIWP == nullptr)
     return false;
@@ -295,36 +304,60 @@
     return false;
 
   DivergentValues.clear();
+  gpuDA = nullptr;
+
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-  DivergencePropagator DP(F, TTI,
-                          getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-                          PDT, DivergentValues);
-  DP.populateWithSourcesOfDivergence();
-  DP.propagate();
+
+  if (UseGPUDA) {
+    // run the new GPU divergence analysis
+    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    gpuDA = llvm::make_unique<GPUDivergenceAnalysis>(F, DT, PDT, LI, TTI);
+
+  } else {
+    // run LLVM's existing DivergenceAnalysis
+    DivergencePropagator DP(F, TTI,
+                            DT,
+                            PDT, DivergentValues);
+    DP.populateWithSourcesOfDivergence();
+    DP.propagate();
+  }
+
   LLVM_DEBUG(
     dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n";
     print(dbgs(), F.getParent())
   );
+
   return false;
 }
 
-void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
-  if (DivergentValues.empty())
+bool KernelDivergenceAnalysis::isDivergent(const Value *V) const {
+  if (gpuDA) return gpuDA->isDivergent(*V);
+  else return DivergentValues.count(V);
+}
+
+void KernelDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
+  if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty())
     return;
-  const Value *FirstDivergentValue = *DivergentValues.begin();
+
   const Function *F;
-  if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
-    F = Arg->getParent();
-  } else if (const Instruction *I =
-                 dyn_cast<Instruction>(FirstDivergentValue)) {
-    F = I->getParent()->getParent();
-  } else {
-    llvm_unreachable("Only arguments and instructions can be divergent");
+  if (!DivergentValues.empty()) {
+    const Value *FirstDivergentValue = *DivergentValues.begin();
+    if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
+      F = Arg->getParent();
+    } else if (const Instruction *I =
+                   dyn_cast<Instruction>(FirstDivergentValue)) {
+      F = I->getParent()->getParent();
+    } else {
+      llvm_unreachable("Only arguments and instructions can be divergent");
+    }
+  } else if (gpuDA) {
+    F = &gpuDA->getFunction();
   }
 
   // Dumps all divergent values in F, arguments and then instructions.
   for (auto &Arg : F->args()) {
-    OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : "           ");
+    OS << (isDivergent(&Arg) ? "DIVERGENT: " : "           ");
     OS << Arg << "\n";
   }
   // Iterate instructions using instructions() to ensure a deterministic order.
@@ -332,7 +365,7 @@
     auto &BB = *BI;
     OS << "\n           " << BB.getName() << ":\n";
     for (auto &I : BB.instructionsWithoutDebug()) {
-      OS << (DivergentValues.count(&I) ? "DIVERGENT:     " : "               ");
+      OS << (isDivergent(&I) ? "DIVERGENT:     " : "               ");
       OS << I << "\n";
     }
   }
Index: lib/Analysis/SyncDependenceAnalysis.cpp
===================================================================
--- /dev/null
+++ lib/Analysis/SyncDependenceAnalysis.cpp
@@ -0,0 +1,382 @@
+//===- SyncDependenceAnalysis.cpp - Divergent Branch Dependence Calculation
+//--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an algorithm that returns for a divergent branch
+// the set of basic blocks whose phi nodes become divergent due to divergent
+// control. These are the blocks that are reachable by two disjoint paths from
+// the branch or loop exits that have a reaching path that is disjoint from a
+// path to the loop latch.
+//
+// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model
+// control-induced divergence in phi nodes.
+//
+// -- Summary --
+// The SyncDependenceAnalysis lazily computes sync dependences [3].
+// The analysis evaluates the disjoint path criterion [2] by a reduction
+// to SSA construction. The SSA construction algorithm is implemented as
+// a simple data-flow analysis [1].
+//
+// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy
+// [2] "Efficiently Computing Static Single Assignment Form
+//     and the Control Dependence Graph", TOPLAS '91,
+//           Cytron, Ferrante, Rosen, Wegman and Zadeck
+// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack
+// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira
+//
+// -- Sync dependence --
+// Sync dependence [4] characterizes the control flow aspect of the
+// propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// -- Reduction to SSA construction --
+// There are two disjoint paths from A to X, if a certain variant of SSA
+// construction places a phi node in X under the following set-up scheme [2].
+//
+// This variant of SSA construction ignores incoming undef values.
+// That is paths from the entry without a definition do not result in
+// phi nodes.
+//
+//       entry
+//     /      \
+//    A        \
+//  /   \       Y
+// B     C     /
+//  \   /  \  /
+//    D     E
+//     \   /
+//       F
+// Assume that A contains a divergent branch. We are interested
+// in the set of all blocks where each block is reachable from A
+// via two disjoint paths. This would be the set {D, F} in this
+// case.
+// To generally reduce this query to SSA construction we introduce
+// a virtual variable x and assign to x different values in each
+// successor block of A.
+//           entry
+//         /      \
+//        A        \
+//      /   \       Y
+// x = 0   x = 1   /
+//      \  /   \  /
+//        D     E
+//         \   /
+//           F
+// Our flavor of SSA construction for x will construct the following
+//            entry
+//          /      \
+//         A        \
+//       /   \       Y
+// x0 = 0   x1 = 1  /
+//       \   /   \ /
+//      x2=phi    E
+//         \     /
+//          x3=phi
+// The blocks D and F contain phi nodes and are thus each reachable
+// by two disjoins paths from A.
+//
+// -- Remarks --
+// In case of loop exits we need to check the disjoint path criterion for loops
+// [2]. To this end, we check whether the definition of x differs between the
+// loop exit and the loop header (_after_ SSA construction).
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/SyncDependenceAnalysis.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+
+#include <stack>
+#include <unordered_set>
+
+#define DEBUG_TYPE "sync-dependence"
+
+namespace llvm {
+
+ConstBlockSet SyncDependenceAnalysis::emptyBlockSet;
+
+SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT,
+                                               const PostDominatorTree &PDT,
+                                               const LoopInfo &LI)
+    : funcRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI) {}
+
+SyncDependenceAnalysis::~SyncDependenceAnalysis() {
+  for (auto it : cachedBranchJoins) {
+    delete it.second;
+  }
+  for (auto it : cachedLoopExitJoins) {
+    delete it.second;
+  }
+}
+
+using FunctionRPOT = ReversePostOrderTraversal<const Function *>;
+
+// divergence propagator for reducible CFGs
+struct DivergencePropagator {
+  const FunctionRPOT &funcRPOT;
+  const DominatorTree &DT;
+  const PostDominatorTree &PDT;
+  const LoopInfo &LI;
+
+  // identified join points
+  ConstBlockSet *const joinBlocks;
+
+  // reached loop exits (by a path disjoint to a path to the loop header)
+  SmallPtrSet<const BasicBlock *, 4> reachedLoopExits;
+
+  // if defMap[B] == C then C is the dominating definition at block B
+  // if defMap[B] ~ undef then we haven't seen B yet
+  // if defMap[B] == B then B is a join point of disjoint paths from X
+  using DefMap = std::map<const BasicBlock *, const BasicBlock *>;
+  DefMap defMap;
+
+  // all blocks with pending visits
+  std::unordered_set<const BasicBlock *> pendingUpdates;
+
+  DivergencePropagator(const FunctionRPOT &funcRPOT, const DominatorTree &DT,
+                       const PostDominatorTree &PDT, const LoopInfo &LI)
+      : funcRPOT(funcRPOT), DT(DT), PDT(PDT), LI(LI),
+        joinBlocks(new ConstBlockSet) {}
+
+  // set the definition at @block and mark @block as pending for a visit
+  void addPending(const BasicBlock &block, const BasicBlock &defBlock) {
+    bool wasAdded = defMap.emplace(&block, &defBlock).second;
+    if (wasAdded)
+      pendingUpdates.insert(&block);
+  }
+
+  void printDefs(raw_ostream &out) {
+    out << "Propagator::defMap {\n";
+    for (const auto *block : funcRPOT) {
+      auto it = defMap.find(block);
+      out << block->getName() << " : ";
+      if (it == defMap.end()) {
+        out << "\n";
+      } else {
+        const auto *defBlock = it->second;
+        out << (defBlock ? defBlock->getName() : "<null>") << "\n";
+      }
+    }
+    out << "}\n";
+  }
+
+  // process @succBlock with reaching definition @defBlock
+  // the original divergent branch was in @parentLoop (if any)
+  void visitSuccessor(const BasicBlock &succBlock, const Loop *parentLoop,
+                      const BasicBlock &defBlock) {
+
+    // @succBlock is a loop exit
+    if (parentLoop && !parentLoop->contains(&succBlock)) {
+      defMap.emplace(&succBlock, &defBlock);
+      reachedLoopExits.insert(&succBlock);
+      return;
+    }
+
+    // first reaching def?
+    auto itLastDef = defMap.find(&succBlock);
+    if (itLastDef == defMap.end()) {
+      addPending(succBlock, defBlock);
+      return;
+    }
+
+    // a join of at least two definitions
+    if (itLastDef->second != &defBlock) {
+      // do we know this join already?
+      if (!joinBlocks->insert(&succBlock).second)
+        return;
+
+      // update the definition
+      addPending(succBlock, succBlock);
+    }
+  }
+
+  // find all blocks reachable by two disjoint paths from @rootTerm.
+  // This method works for both divergent TerminatorInsts and loops with
+  // divergent exits.
+  // @rootBlock is either the block containing the branch or the header of the
+  // divergent loop.
+  // @nodeSuccessors is the set of successors of the node (Loop or Terminator)
+  // headed by @rootBlock.
+  // @parentLoop is the parent loop of the Loop or the loop that contains the
+  // Terminator.
+  template <typename SuccessorIterable>
+  ConstBlockSet *computeJoinPoints(const BasicBlock &rootBlock,
+                                   SuccessorIterable nodeSuccessors,
+                                   const Loop *parentLoop) {
+    // immediate post dominator (no join block beyond that block)
+    const auto *pdNode = PDT.getNode(const_cast<BasicBlock *>(&rootBlock));
+    const auto *ipdNode = pdNode->getIDom();
+    const auto *pdBoundBlock = ipdNode ? ipdNode->getBlock() : nullptr;
+
+    // bootstrap with branch targets
+    for (const auto *succBlock : nodeSuccessors) {
+      defMap.emplace(succBlock, succBlock);
+
+      if (parentLoop && !parentLoop->contains(succBlock)) {
+        // immediate loop exit from node.
+        reachedLoopExits.insert(succBlock);
+        continue;
+      } else {
+        // regular successor
+        pendingUpdates.insert(succBlock);
+      }
+    }
+
+    auto itBeginRPO = funcRPOT.begin();
+
+    // skip until term (TODO RPOT won't let us start at @term directly)
+    for (; *itBeginRPO != &rootBlock; ++itBeginRPO) {
+    }
+
+    auto itEndRPO = funcRPOT.end();
+    assert(itBeginRPO != itEndRPO);
+
+    // propagate definitions at the immediate successors of the node in RPO
+    auto itBlockRPO = itBeginRPO;
+    while (++itBlockRPO != itEndRPO && *itBlockRPO != pdBoundBlock) {
+      const auto *block = *itBlockRPO;
+
+      // skip @block if not pending update
+      auto itPending = pendingUpdates.find(block);
+      if (itPending == pendingUpdates.end())
+        continue;
+      pendingUpdates.erase(itPending);
+
+      // propagate definition at @block to its successors
+      auto itDef = defMap.find(block);
+      const auto *defBlock = itDef->second;
+      assert(defBlock);
+
+      auto *blockLoop = LI.getLoopFor(block);
+      if (parentLoop &&
+          (parentLoop != blockLoop && parentLoop->contains(blockLoop))) {
+        // if the successor is the header of a nested loop pretend its a
+        // single node with the loop's exits as successors
+        SmallVector<BasicBlock *, 4> blockLoopExits;
+        blockLoop->getExitBlocks(blockLoopExits);
+        for (const auto *blockLoopExit : blockLoopExits) {
+          visitSuccessor(*blockLoopExit, parentLoop, *defBlock);
+        }
+
+      } else {
+        // the successors are either on the same loop level or loop exits
+        for (const auto *succBlock : successors(block)) {
+          visitSuccessor(*succBlock, parentLoop, *defBlock);
+        }
+      }
+    }
+
+    // We need to know the definition at the parent loop header to decide
+    // whether the definition at the header is different from the definition at
+    // the loop exits, which would indicate a divergent loop exits.
+    //
+    // A // loop header
+    // |
+    // B // nested loop header
+    // |
+    // C -> X (exit from B loop) -..-> (A latch)
+    // |
+    // D -> back to B (B latch)
+    // |
+    // proper exit from both loops
+    //
+    // D post-dominates B as it is the only proper exit from the "A loop".
+    // If C has a divergent branch, propagation will therefore stop at D.
+    // That implies that B will never receive a definition.
+    // But that definition can only be the same as at D (D itself in thise case)
+    // because all paths to anywhere have to pass through D.
+    //
+    const BasicBlock *parentLoopHeader =
+        parentLoop ? parentLoop->getHeader() : nullptr;
+    if (parentLoop && parentLoop->contains(pdBoundBlock)) {
+      defMap[parentLoopHeader] = defMap[pdBoundBlock];
+    }
+
+    // analyze reached loop exits
+    if (!reachedLoopExits.empty()) {
+      assert(parentLoop);
+      const auto *headerDefBlock = defMap[parentLoopHeader];
+      LLVM_DEBUG(printDefs(dbgs()));
+      assert(headerDefBlock && "no definition in header of carrying loop");
+
+      for (const auto *exitBlock : reachedLoopExits) {
+        assert((defMap[exitBlock] != nullptr) &&
+               "no reaching def at loop exit");
+        if (defMap[exitBlock] != headerDefBlock) {
+          joinBlocks->insert(exitBlock);
+        }
+      }
+    }
+
+    return joinBlocks;
+  }
+};
+
+const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &loop) {
+  using LoopExitVec = SmallVector<BasicBlock *, 4>;
+  LoopExitVec loopExits;
+  loop.getExitBlocks(loopExits);
+  if (loopExits.size() < 1) {
+    return emptyBlockSet;
+  }
+
+  // already available in cache?
+  auto it = cachedLoopExitJoins.find(&loop);
+  if (it != cachedLoopExitJoins.end())
+    return *it->second;
+
+  // compute all join points
+  DivergencePropagator propagator{funcRPOT, DT, PDT, LI};
+  auto *joinBlocks = propagator.computeJoinPoints<const LoopExitVec &>(
+      *loop.getHeader(), loopExits, loop.getParentLoop());
+
+  cachedLoopExitJoins[&loop] = joinBlocks;
+  return *joinBlocks;
+}
+
+const ConstBlockSet &
+SyncDependenceAnalysis::join_blocks(const TerminatorInst &term) {
+  // trivial case
+  if (term.getNumSuccessors() < 1) {
+    return emptyBlockSet;
+  }
+
+  // already available in cache?
+  auto it = cachedBranchJoins.find(&term);
+  if (it != cachedBranchJoins.end())
+    return *it->second;
+
+  // compute all join points
+  DivergencePropagator propagator{funcRPOT, DT, PDT, LI};
+  const auto &termBlock = *term.getParent();
+  auto *joinBlocks = propagator.computeJoinPoints<succ_const_range>(
+      termBlock, successors(term.getParent()), LI.getLoopFor(&termBlock));
+
+  cachedBranchJoins[&term] = joinBlocks;
+  return *joinBlocks;
+}
+
+} // namespace llvm
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -984,7 +984,7 @@
 void SelectionDAG::init(MachineFunction &NewMF,
                         OptimizationRemarkEmitter &NewORE,
                         Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
-                        DivergenceAnalysis * Divergence) {
+                        KernelDivergenceAnalysis * Divergence) {
   MF = &NewMF;
   SDAGISelPass = PassPtr;
   ORE = &NewORE;
Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -417,7 +417,7 @@
   SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI);
 
   CurDAG->init(*MF, *ORE, this, LibInfo,
-   getAnalysisIfAvailable<DivergenceAnalysis>());
+   getAnalysisIfAvailable<KernelDivergenceAnalysis>());
   FuncInfo->set(Fn, *MF, CurDAG);
 
   // Now get the optional analyzes if we want to.
Index: lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -16,7 +16,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/KernelDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/IR/IRBuilder.h"
@@ -32,7 +32,7 @@
 
 class AMDGPUAnnotateUniformValues : public FunctionPass,
                        public InstVisitor<AMDGPUAnnotateUniformValues> {
-  DivergenceAnalysis *DA;
+  KernelDivergenceAnalysis *DA;
   MemoryDependenceResults *MDR;
   LoopInfo *LI;
   DenseMap<Value*, GetElementPtrInst*> noClobberClones;
@@ -49,7 +49,7 @@
     return "AMDGPU Annotate Uniform Values";
   }
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<KernelDivergenceAnalysis>();
     AU.addRequired<MemoryDependenceWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
     AU.setPreservesAll();
@@ -64,7 +64,7 @@
 
 INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                       "Add AMDGPU uniform metadata", false, false)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
@@ -176,7 +176,7 @@
   if (skipFunction(F))
     return false;
 
-  DA  = &getAnalysis<DivergenceAnalysis>();
+  DA  = &getAnalysis<KernelDivergenceAnalysis>();
   MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
   LI  = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -18,7 +18,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/KernelDivergenceAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Passes.h"
@@ -60,7 +60,7 @@
                              public InstVisitor<AMDGPUCodeGenPrepare, bool> {
   const GCNSubtarget *ST = nullptr;
   AssumptionCache *AC = nullptr;
-  DivergenceAnalysis *DA = nullptr;
+  KernelDivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
   bool HasUnsafeFPMath = false;
   AMDGPUAS AMDGPUASI;
@@ -177,7 +177,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<KernelDivergenceAnalysis>();
     AU.setPreservesAll();
  }
 };
@@ -898,7 +898,7 @@
   const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
   ST = &TM.getSubtarget<GCNSubtarget>(F);
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
-  DA = &getAnalysis<DivergenceAnalysis>();
+  DA = &getAnalysis<KernelDivergenceAnalysis>();
   HasUnsafeFPMath = hasUnsafeFPMath(F);
   AMDGPUASI = TM.getAMDGPUAS();
 
@@ -918,7 +918,7 @@
 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis)
 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                     false, false)
 
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -29,7 +29,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/KernelDivergenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
@@ -87,7 +87,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AMDGPUArgumentUsageInfo>();
     AU.addRequired<AMDGPUPerfHintAnalysis>();
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<KernelDivergenceAnalysis>();
     SelectionDAGISel::getAnalysisUsage(AU);
   }
 
@@ -257,7 +257,7 @@
                       "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis)
 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
                     "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
 
Index: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -16,7 +16,6 @@
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
Index: lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -25,7 +25,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/KernelDivergenceAnalysis.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -70,7 +70,7 @@
 INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                      "Unify divergent function exit nodes", false, false)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis)
 INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                     "Unify divergent function exit nodes", false, false)
 
@@ -78,10 +78,10 @@
   // TODO: Preserve dominator tree.
   AU.addRequired<PostDominatorTreeWrapperPass>();
 
-  AU.addRequired<DivergenceAnalysis>();
+  AU.addRequired<KernelDivergenceAnalysis>();
 
   // No divergent values are changed, only blocks and branch edges.
-  AU.addPreserved<DivergenceAnalysis>();
+  AU.addPreserved<KernelDivergenceAnalysis>();
 
   // We preserve the non-critical-edgeness property
   AU.addPreservedID(BreakCriticalEdgesID);
@@ -95,7 +95,7 @@
 
 /// \returns true if \p BB is reachable through only uniform branches.
 /// XXX - Is there a more efficient way to find this?
-static bool isUniformlyReached(const DivergenceAnalysis &DA,
+static bool isUniformlyReached(const KernelDivergenceAnalysis &DA,
                                BasicBlock &BB) {
   SmallVector<BasicBlock *, 8> Stack;
   SmallPtrSet<BasicBlock *, 8> Visited;
@@ -163,7 +163,7 @@
   if (PDT.getRoots().size() <= 1)
     return false;
 
-  DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
+  KernelDivergenceAnalysis &DA = getAnalysis<KernelDivergenceAnalysis>();
 
   // Loop over all of the blocks in a function, tracking all of the blocks that
   // return.
Index: lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -16,7 +16,7 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/KernelDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/IR/BasicBlock.h"
@@ -52,7 +52,7 @@
 using StackVector = SmallVector<StackEntry, 16>;
 
 class SIAnnotateControlFlow : public FunctionPass {
-  DivergenceAnalysis *DA;
+  KernelDivergenceAnalysis *DA;
 
   Type *Boolean;
   Type *Void;
@@ -116,7 +116,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<KernelDivergenceAnalysis>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
@@ -127,7 +127,7 @@
 INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
                       "Annotate SI Control Flow", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis)
 INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
                     "Annotate SI Control Flow", false, false)
 
@@ -386,7 +386,7 @@
 bool SIAnnotateControlFlow::runOnFunction(Function &F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  DA = &getAnalysis<DivergenceAnalysis>();
+  DA = &getAnalysis<KernelDivergenceAnalysis>();
 
   for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
        E = df_end(&F.getEntryBlock()); I != E; ++I) {
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -324,7 +324,7 @@
                                      unsigned Depth = 0) const override;
 
   bool isSDNodeSourceOfDivergence(const SDNode *N,
-    FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override;
+    FunctionLoweringInfo *FLI, KernelDivergenceAnalysis *DA) const override;
 
   bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                        unsigned MaxDepth = 5) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8486,7 +8486,7 @@
 }
 
 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
-  FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const
+  FunctionLoweringInfo * FLI, KernelDivergenceAnalysis * KDA) const
 {
   switch (N->getOpcode()) {
     case ISD::Register:
@@ -8519,7 +8519,7 @@
           else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
             return true;
         }
-        return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+        return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg));
       }
     }
     break;
Index: lib/Transforms/Scalar/LoopUnswitch.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnswitch.cpp
+++ lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -33,7 +33,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/KernelDivergenceAnalysis.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -215,7 +215,7 @@
       AU.addRequired<AssumptionCacheTracker>();
       AU.addRequired<TargetTransformInfoWrapperPass>();
       if (hasBranchDivergence)
-        AU.addRequired<DivergenceAnalysis>();
+        AU.addRequired<KernelDivergenceAnalysis>();
       getLoopAnalysisUsage(AU);
     }
 
@@ -383,7 +383,7 @@
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis)
 INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
                       false, false)
 
@@ -864,7 +864,7 @@
     return false;
   }
   if (hasBranchDivergence &&
-      getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
+      getAnalysis<KernelDivergenceAnalysis>().isDivergent(LoopCond)) {
     LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
                       << currentLoop->getHeader()->getName()
                       << " at non-trivial condition '" << *Val
Index: lib/Transforms/Scalar/StructurizeCFG.cpp
===================================================================
--- lib/Transforms/Scalar/StructurizeCFG.cpp
+++ lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -13,7 +13,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/KernelDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
@@ -183,7 +183,7 @@
   Function *Func;
   Region *ParentRegion;
 
-  DivergenceAnalysis *DA;
+  KernelDivergenceAnalysis *DA;
   DominatorTree *DT;
   LoopInfo *LI;
 
@@ -269,7 +269,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     if (SkipUniformRegions)
-      AU.addRequired<DivergenceAnalysis>();
+      AU.addRequired<KernelDivergenceAnalysis>();
     AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
@@ -285,7 +285,7 @@
 
 INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
                       false, false)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(KernelDivergenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
@@ -914,7 +914,7 @@
 }
 
 static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
-                                   const DivergenceAnalysis &DA) {
+                                   const KernelDivergenceAnalysis &DA) {
   for (auto E : R->elements()) {
     if (!E->isSubRegion()) {
       auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
@@ -962,7 +962,7 @@
     // but we shouldn't rely on metadata for correctness!
     unsigned UniformMDKindID =
         R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
-    DA = &getAnalysis<DivergenceAnalysis>();
+    DA = &getAnalysis<KernelDivergenceAnalysis>();
 
     if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
       LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
Index: test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/always_uniform.ll
@@ -0,0 +1,14 @@
+; RUN: opt  -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-da %s | FileCheck %s
+
+define amdgpu_kernel void @workitem_id_x() #1 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x)
+; CHECK-NOT: DIVERGENT:  %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+
+attributes #0 = { nounwind readnone }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-da %s | FileCheck %s
 
 ; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
 define i32 @test1(i32* %ptr, i32 %val) #0 {
Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_diverge.ll
@@ -0,0 +1,26 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-da %s | FileCheck %s
+
+define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_diverge'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %cond.var = icmp slt i32 %tid, 0
+  br i1 %cond.var, label %B, label %C ; divergent
+; CHECK: DIVERGENT: br i1 %cond.var,
+B:
+  %cond.uni = icmp slt i32 %n, 0
+  br i1 %cond.uni, label %C, label %merge ; uniform
+; CHECK-NOT: DIVERGENT: br i1 %cond.uni,
+C:
+  %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B  ]
+; CHECK: DIVERGENT: %phi.var.hidden = phi i32
+  br label %merge
+merge:
+  %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ]
+; CHECK: DIVERGENT: %phi.ipd = phi i32
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/hidden_loopdiverge.ll
@@ -0,0 +1,224 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-da %s | FileCheck %s
+
+; divergent loop (H<header><exiting to X>, B<exiting to Y>)
+; the divergent join point in %exit is obscured by uniform control joining in %X
+define amdgpu_kernel void @hidden_loop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_loop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %X, label %H  ; uniform
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ]
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %B ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+B:
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exity = icmp sgt i32 %tid, 0
+  br i1 %div.exity, label %Y, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exity =  
+; CHECK: DIVERGENT: br i1 %div.exity, 
+
+X:
+  %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %H ] ; temporal divergent phi
+  br i1 %uni.cond, label %Y, label %exit
+; CHECK: DIVERGENT: %div.merge.x =
+
+Y:
+  %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ]
+  br label %exit
+; CHECK: DIVERGENT: %div.merge.y =
+
+exit:
+  %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.exit =
+}
+
+; divergent loop (H<header><exiting to X>, B<exiting to Y>)
+; the phi nodes in X and Y don't actually receive divergent values
+define amdgpu_kernel void @unobserved_loop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'unobserved_loop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  br i1 %uni.cond, label %X, label %H  ; uniform
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %B ]
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %div.exitx, label %X, label %B ; divergent branch
+; CHECK: DIVERGENT: %div.exitx =  
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+B:
+  %uni.inc = add i32 %uni.merge.h, 1
+  %div.exity = icmp sgt i32 %tid, 0
+  br i1 %div.exity, label %Y, label %H ; divergent branch
+; CHECK: DIVERGENT: %div.exity =  
+; CHECK: DIVERGENT: br i1 %div.exity, 
+
+X:
+  %uni.merge.x = phi i32 [ %a, %entry ], [ %b, %H ] 
+  br label %exit
+
+Y:
+  %uni.merge.y = phi i32 [ %b, %B ]
+  br label %exit
+
+exit:
+  %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.exit =
+}
+
+; divergent loop (G<header>, L<exiting to D>) inside divergent loop (H<header>, B<exiting to X>, C<exiting to Y>, D, G, L)
+; the inner loop has no exit to top level.
+; the outer loop becomes divergent as its exiting branch in C is control-dependent on the inner loop's divergent loop exit in D.
+define amdgpu_kernel void @hidden_nestedloop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_nestedloop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %uni.cond, label %X, label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %D ]
+  br i1 %uni.cond, label %G, label %B
+; CHECK: DIVERGENT: %div.exitx =  
+
+B:
+  br i1 %uni.cond, label %X, label %C 
+
+C:
+  br i1 %uni.cond, label %Y, label %D
+
+D:
+  %uni.inc = add i32 %uni.merge.h, 1
+  br label %H
+
+G:
+  br i1 %div.exitx, label %C, label %L
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+L:
+  br i1 %uni.cond, label %D, label %G
+
+X:
+  %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %B ] ; temporal divergent phi
+  br i1 %uni.cond, label %Y, label %exit
+; CHECK: DIVERGENT: %div.merge.x =
+
+Y:
+  %div.merge.y = phi i32 [ 42, %X ], [ %b, %C ]
+  br label %exit
+; CHECK: DIVERGENT: %div.merge.y =
+
+exit:
+  %div.merge.exit = phi i32 [ %a, %X ], [ %b, %Y ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.exit =
+}
+
+; divergent loop (G<header>, L<exiting to X>) in divergent loop (H<header>, B<exiting to C>, C, G, L)
+; the outer loop has no immediately divergent exiting edge.
+; the inner exiting edge is exiting to top-level through the outer loop causing both to become divergent.
+define amdgpu_kernel void @hidden_doublebreak_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_doublebreak_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %uni.cond, label %X, label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc, %C ]
+  br i1 %uni.cond, label %G, label %B
+; CHECK: DIVERGENT: %div.exitx =  
+
+B:
+  br i1 %uni.cond, label %Y, label %C 
+
+C:
+  %uni.inc = add i32 %uni.merge.h, 1
+  br label %H
+
+G:
+  br i1 %div.exitx, label %X, label %L ; two-level break
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+L:
+  br i1 %uni.cond, label %C, label %G
+
+X:
+  %div.merge.x = phi i32 [ %a, %entry ], [ %uni.merge.h, %G ] ; temporal divergence
+  br label %Y
+; CHECK: DIVERGENT: %div.merge.x =
+
+Y:
+  %div.merge.y = phi i32 [ 42, %X ], [ %b, %B ]
+  ret void
+; CHECK: DIVERGENT: %div.merge.y =
+}
+
+; divergent loop (G<header>, L<exiting to D>) contained inside a uniform loop (H<header>, B, G, L , D<exiting to x>)
+define amdgpu_kernel void @hidden_containedloop_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_containedloop_diverge':
+; CHECK-NOT: DIVERGENT: %uni.
+; CHECK-NOT: DIVERGENT: br i1 %uni.
+
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+  %div.exitx = icmp slt i32 %tid, 0
+  br i1 %uni.cond, label %X, label %H
+
+H:
+  %uni.merge.h = phi i32 [ 0, %entry ], [ %uni.inc.d, %D ]
+  br i1 %uni.cond, label %G, label %B
+; CHECK: DIVERGENT: %div.exitx =  
+
+B:
+  %div.merge.b = phi i32 [ 42, %H ], [ %uni.merge.g, %G ]
+  br label %D
+; CHECK: DIVERGENT: %div.merge.b =
+
+G:
+  %uni.merge.g = phi i32 [ 123, %H ], [ %uni.inc.l, %L ]
+  br i1 %div.exitx, label %B, label %L
+; CHECK: DIVERGENT: br i1 %div.exitx, 
+
+L:
+  %uni.inc.l = add i32 %uni.merge.g, 1
+  br i1 %uni.cond, label %G, label %D
+
+D:
+  %uni.inc.d = add i32 %uni.merge.h, 1
+  br i1 %uni.cond, label %X, label %H
+
+X:
+  %uni.merge.x = phi i32 [ %a, %entry ], [ %uni.inc.d, %D ]
+  ret void
+
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-da %s | FileCheck %s
 
 ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
 define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
Index: test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
@@ -1,6 +1,6 @@
-; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s
+; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-da | FileCheck %s
 
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_ps':
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_amdgpu_ps':
 ; CHECK: DIVERGENT:
 ; CHECK-NOT: %arg0
 ; CHECK-NOT: %arg1
@@ -14,7 +14,7 @@
   ret void
 }
 
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_kernel':
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_amdgpu_kernel':
 ; CHECK-NOT: %arg0
 ; CHECK-NOT: %arg1
 ; CHECK-NOT: %arg2
@@ -26,7 +26,7 @@
   ret void
 }
 
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_c':
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_c':
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -1,4 +1,4 @@
-;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s
+;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-da %s | FileCheck %s
 
 ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap(
 define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
Index: test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/llvm.amdgcn.image.atomic.ll
@@ -1,4 +1,4 @@
-;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence %s | FileCheck %s
+;RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-da %s | FileCheck %s
 
 ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(
 define float @image_atomic_swap(<8 x i32> inreg %rsrc, i32 inreg %addr, i32 inreg %data) #0 {
Index: test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s
+; RUN: opt %s -mtriple amdgcn-- -analyze -divergence -use-gpu-da | FileCheck %s
 
 ; CHECK: DIVERGENT:  %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp2
 ; CHECK: DIVERGENT:  %tmp10 = load volatile float, float addrspace(1)* %tmp5, align 4
Index: test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/phi-undef.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-da %s | FileCheck %s
 
 ; CHECK-LABEL: 'test1':
 ; CHECK-NEXT: DIVERGENT: i32 %bound
Index: test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
@@ -1,17 +0,0 @@
-; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s
-
-; CHECK: DIVERGENT:  %tmp = cmpxchg volatile
-define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 {
-entry:
-  unreachable
-
-unreachable_loop:                                        ; preds = %do.body.i, %if.then11
-  %tmp = cmpxchg volatile i32 addrspace(1)* null, i32 0, i32 0 seq_cst seq_cst
-  %cmp.i = extractvalue { i32, i1 } %tmp, 1
-  br i1 %cmp.i, label %unreachable_loop, label %end
-
-end:                                      ; preds = %do.body.i51, %atomicAdd_g_f.exit
-  unreachable
-}
-
-attributes #0 = { norecurse nounwind }
Index: test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: opt  -mtriple amdgcn-unknown-amdhsa -analyze -divergence %s | FileCheck %s
+; RUN: opt  -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-da %s | FileCheck %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0
Index: test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/IndirectUniAccess.ll
@@ -0,0 +1,74 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ]
+; CHECK-NEXT: DIVERGENT:  %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29
+; CHECK-NEXT: DIVERGENT:  store double %x.0.lcssa, double* %arrayidx10, align 8
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1
+; CHECK-NEXT: DIVERGENT:  %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29
+; CHECK-NEXT: DIVERGENT:  %2 = load double, double* %arrayidx8, align 8
+; CHECK-NEXT: DIVERGENT:  %add = fadd double %x.025, %2
+; CHECK-NEXT: }
+; CHECK: Divergence of loop for.body4 {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ]
+; CHECK-NEXT: DIVERGENT:  %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv
+; CHECK-NEXT: DIVERGENT:  %0 = load i32, i32* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %idxprom5 = sext i32 %0 to i64
+; CHECK-NEXT: DIVERGENT:  %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5
+; CHECK-NEXT: DIVERGENT:  %1 = load double*, double** %arrayidx6, align 8
+; CHECK-NEXT: DIVERGENT:  %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29
+; CHECK-NEXT: DIVERGENT:  %2 = load double, double* %arrayidx8, align 8
+; CHECK-NEXT: DIVERGENT:  %add = fadd double %x.025, %2
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+
+; Function Attrs: norecurse nounwind uwtable
+define void @test(i32* nocapture readonly %Index, double** nocapture readonly %A, double* nocapture %C, i32 %m, i32 %n) #0 {
+entry:
+  %cmp27 = icmp sgt i32 %n, 0
+  br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp224 = icmp sgt i32 %m, 0
+  %wide.trip.count = zext i32 %m to i64
+  %wide.trip.count31 = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup3, %for.body.lr.ph
+  %indvars.iv29 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next30, %for.cond.cleanup3 ]
+  br i1 %cmp224, label %for.body4.preheader, label %for.cond.cleanup3
+
+for.body4.preheader:                              ; preds = %for.body
+  br label %for.body4
+
+for.cond.cleanup3:                                ; preds = %for.body4, %for.body
+  %x.0.lcssa = phi double [ 0.000000e+00, %for.body ], [ %add, %for.body4 ]
+  %arrayidx10 = getelementptr inbounds double, double* %C, i64 %indvars.iv29
+  store double %x.0.lcssa, double* %arrayidx10, align 8
+  %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1
+  %exitcond32 = icmp eq i64 %indvars.iv.next30, %wide.trip.count31
+  br i1 %exitcond32, label %for.cond.cleanup, label %for.body
+
+for.body4:                                        ; preds = %for.body4.preheader, %for.body4
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 0, %for.body4.preheader ]
+  %x.025 = phi double [ %add, %for.body4 ], [ 0.000000e+00, %for.body4.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %Index, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %idxprom5 = sext i32 %0 to i64
+  %arrayidx6 = getelementptr inbounds double*, double** %A, i64 %idxprom5
+  %1 = load double*, double** %arrayidx6, align 8
+  %arrayidx8 = getelementptr inbounds double, double* %1, i64 %indvars.iv29
+  %2 = load double, double* %arrayidx8, align 8
+  %add = fadd double %x.025, %2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+}
+
+attributes #0 = { norecurse nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivBranch.ll
@@ -0,0 +1,44 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ]
+; CHECK-NEXT: DIVERGENT:  %hfreq = srem i64 %indvars.iv, 2
+; CHECK-NEXT: DIVERGENT:  %toggle = trunc i64 %hfreq to i1
+; CHECK-NEXT: DIVERGENT:  br i1 %toggle, label %A, label %B
+; CHECK-NEXT: DIVERGENT:  %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv
+; CHECK-NEXT: DIVERGENT:  store float %divphi, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+define void @test1(float* nocapture %ptr, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ]
+  %hfreq = srem i64 %indvars.iv, 2
+  %toggle = trunc i64 %hfreq to i1
+  br i1 %toggle, label %A, label %B
+
+A:
+  %trunc = trunc i64 %n to i32
+  %cast = sitofp i32 %trunc to float
+  br label %B
+
+B:
+  %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv
+  store float %divphi, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/LoopWithDivLoop.ll
@@ -0,0 +1,60 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Printing analysis 'Loop Divergence Printer' for function 'test1':
+; CHECK-NEXT: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ]
+; CHECK-NEXT: DIVERGENT:  %row = mul i64 %n, %indvars.iv
+; CHECK-NEXT: DIVERGENT:  %idx = add i64 %row, %indvars.iv2
+; CHECK-NEXT: DIVERGENT:  %trunc = trunc i64 %idx to i32
+; CHECK-NEXT: DIVERGENT:  %val = sitofp i32 %trunc to float
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+; CHECK-NEXT: DIVERGENT:  store float %val, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv
+; CHECK-NEXT: DIVERGENT:  br i1 %exitcond2, label %for.latch, label %for.body2
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+; CHECK: Divergence of loop for.body2 {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ]
+; CHECK-NEXT: DIVERGENT:  %idx = add i64 %row, %indvars.iv2
+; CHECK-NEXT: DIVERGENT:  %trunc = trunc i64 %idx to i32
+; CHECK-NEXT: DIVERGENT:  %val = sitofp i32 %trunc to float
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+; CHECK-NEXT: DIVERGENT:  store float %val, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1
+; CHECK-NEXT: }
+
+define void @test1(float* nocapture %ptr, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %exit
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+exit:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ]
+  br label %for.body2
+
+for.body2:
+  %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ]
+  %row = mul i64 %n, %indvars.iv
+  %idx = add i64 %row, %indvars.iv2
+  %trunc = trunc i64 %idx to i32
+  %val = sitofp i32 %trunc to float
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+  store float %val, float* %arrayidx, align 4
+  %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1
+  %exitcond2 = icmp sge i64 %indvars.iv.next2, %indvars.iv
+  br i1 %exitcond2, label %for.latch, label %for.body2
+
+for.latch:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %exit, label %for.body
+}
+
+attributes #0 = { nounwind }
+
Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/LoopWithLI.ll
@@ -0,0 +1,31 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+; CHECK-NEXT: DIVERGENT:  store float %cast, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+define void @test1(float* nocapture %A, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %trunc = trunc i64 %n to i32
+  %cast = sitofp i32 %trunc to float
+  store float %cast, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniBranch.ll
@@ -0,0 +1,39 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv
+; CHECK-NEXT: DIVERGENT:  store float %divphi, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+define void @test1(float* nocapture %ptr, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %B ]
+  %invar = trunc i64 %n to i1
+  br i1 %invar, label %A, label %B
+
+A:
+  %trunc = trunc i64 %n to i32
+  %cast = sitofp i32 %trunc to float
+  br label %B
+
+B:
+  %divphi = phi float [ %cast, %A ], [ 4.200000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %indvars.iv
+  store float %divphi, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/LoopWithUniLoop.ll
@@ -0,0 +1,55 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ]
+; CHECK-NEXT: DIVERGENT:  %row = mul i64 %n, %indvars.iv
+; CHECK-NEXT: DIVERGENT:  %idx = add i64 %row, %indvars.iv2
+; CHECK-NEXT: DIVERGENT:  %trunc = trunc i64 %idx to i32
+; CHECK-NEXT: DIVERGENT:  %val = sitofp i32 %trunc to float
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+; CHECK-NEXT: DIVERGENT:  store float %val, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+; CHECK: Divergence of loop for.body2 {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ]
+; CHECK-NEXT: DIVERGENT:  %idx = add i64 %row, %indvars.iv2
+; CHECK-NEXT: DIVERGENT:  %trunc = trunc i64 %idx to i32
+; CHECK-NEXT: DIVERGENT:  %val = sitofp i32 %trunc to float
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+; CHECK-NEXT: DIVERGENT:  store float %val, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1
+; CHECK-NEXT: }
+define void @test1(float* nocapture %ptr, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %exit
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+exit:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.latch ]
+  br label %for.body2
+
+for.body2:
+  %indvars.iv2 = phi i64 [ 0, %for.body ], [ %indvars.iv.next2, %for.body2 ]
+  %row = mul i64 %n, %indvars.iv
+  %idx = add i64 %row, %indvars.iv2
+  %trunc = trunc i64 %idx to i32
+  %val = sitofp i32 %trunc to float
+  %arrayidx = getelementptr inbounds float, float* %ptr, i64 %idx
+  store float %val, float* %arrayidx, align 4
+  %indvars.iv.next2 = add nuw nsw i64 %indvars.iv2, 1
+  %exitcond2 = icmp eq i64 %indvars.iv.next2, %n
+  br i1 %exitcond2, label %for.latch, label %for.body2
+
+for.latch:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %exit, label %for.body
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/NonAffineUniLoop.ll
@@ -0,0 +1,110 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ]
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1
+; CHECK-NEXT: DIVERGENT:  %5 = add nsw i64 %4, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds double, double* %A, i64 %5
+; CHECK-NEXT: DIVERGENT:  %6 = load double, double* %arrayidx, align 8
+; CHECK-NEXT: DIVERGENT:  %8 = add nsw i64 %7, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8
+; CHECK-NEXT: DIVERGENT:  %9 = load double, double* %arrayidx14, align 8
+; CHECK-NEXT: DIVERGENT:  %add15 = fadd double %6, %9
+; CHECK-NEXT: DIVERGENT:  store double %add15, double* %arrayidx14, align 8
+; CHECK-NEXT: }
+; CHECK: Divergence of loop for.body8.lr.ph {
+; CHECK-NEXT: DIVERGENT:  %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ]
+; CHECK-NEXT: DIVERGENT:  %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ]
+; CHECK-NEXT: DIVERGENT:  %1 = sext i32 %mul44 to i64
+; CHECK-NEXT: DIVERGENT:  %2 = sext i32 %len.043 to i64
+; CHECK-NEXT: DIVERGENT:  %mul = shl nsw i32 %mul44, 1
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ]
+; CHECK-NEXT: DIVERGENT:  %3 = add nsw i64 %indvars.iv, %2
+; CHECK-NEXT: DIVERGENT:  %4 = mul nsw i64 %3, %0
+; CHECK-NEXT: DIVERGENT:  %5 = add nsw i64 %4, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds double, double* %A, i64 %5
+; CHECK-NEXT: DIVERGENT:  %6 = load double, double* %arrayidx, align 8
+; CHECK-NEXT: DIVERGENT:  %7 = mul nsw i64 %indvars.iv, %0
+; CHECK-NEXT: DIVERGENT:  %8 = add nsw i64 %7, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8
+; CHECK-NEXT: DIVERGENT:  %9 = load double, double* %arrayidx14, align 8
+; CHECK-NEXT: DIVERGENT:  %add15 = fadd double %6, %9
+; CHECK-NEXT: DIVERGENT:  store double %add15, double* %arrayidx14, align 8
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add i64 %indvars.iv, %1
+; CHECK-NEXT: DIVERGENT:  %cmp6 = icmp slt i64 %indvars.iv.next, %0
+; CHECK-NEXT: DIVERGENT:  br i1 %cmp6, label %for.body8, label %for.cond.cleanup7
+; CHECK-NEXT: }
+; CHECK: Divergence of loop for.body8 {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ]
+; CHECK-NEXT: DIVERGENT:  %3 = add nsw i64 %indvars.iv, %2
+; CHECK-NEXT: DIVERGENT:  %4 = mul nsw i64 %3, %0
+; CHECK-NEXT: DIVERGENT:  %5 = add nsw i64 %4, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds double, double* %A, i64 %5
+; CHECK-NEXT: DIVERGENT:  %6 = load double, double* %arrayidx, align 8
+; CHECK-NEXT: DIVERGENT:  %7 = mul nsw i64 %indvars.iv, %0
+; CHECK-NEXT: DIVERGENT:  %8 = add nsw i64 %7, %indvars.iv53
+; CHECK-NEXT: DIVERGENT:  %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8
+; CHECK-NEXT: DIVERGENT:  %9 = load double, double* %arrayidx14, align 8
+; CHECK-NEXT: DIVERGENT:  %add15 = fadd double %6, %9
+; CHECK-NEXT: DIVERGENT:  store double %add15, double* %arrayidx14, align 8
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add i64 %indvars.iv, %1
+; CHECK-NEXT: }
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(double* nocapture %A, i32 %n) local_unnamed_addr #0 {
+entry:
+  %cmp45 = icmp sgt i32 %n, 0
+  br i1 %cmp45, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp242 = icmp sgt i32 %n, 2
+  %0 = sext i32 %n to i64
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup3, %for.body.lr.ph
+  %indvars.iv53 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next54, %for.cond.cleanup3 ]
+  br i1 %cmp242, label %for.body8.lr.ph.preheader, label %for.cond.cleanup3
+
+for.body8.lr.ph.preheader:                        ; preds = %for.body
+  br label %for.body8.lr.ph
+
+for.cond.cleanup3:                                ; preds = %for.cond.cleanup7, %for.body
+  %indvars.iv.next54 = add nuw nsw i64 %indvars.iv53, 1
+  %exitcond = icmp eq i64 %indvars.iv.next54, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.body8.lr.ph:                                  ; preds = %for.body8.lr.ph.preheader, %for.cond.cleanup7
+  %mul44 = phi i32 [ %mul, %for.cond.cleanup7 ], [ 2, %for.body8.lr.ph.preheader ]
+  %len.043 = phi i32 [ %mul44, %for.cond.cleanup7 ], [ 1, %for.body8.lr.ph.preheader ]
+  %1 = sext i32 %mul44 to i64
+  %2 = sext i32 %len.043 to i64
+  br label %for.body8
+
+for.cond.cleanup7:                                ; preds = %for.body8
+  %mul = shl nsw i32 %mul44, 1
+  %cmp2 = icmp slt i32 %mul, %n
+  br i1 %cmp2, label %for.body8.lr.ph, label %for.cond.cleanup3
+
+for.body8:                                        ; preds = %for.body8.lr.ph, %for.body8
+  %indvars.iv = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next, %for.body8 ]
+  %3 = add nsw i64 %indvars.iv, %2
+  %4 = mul nsw i64 %3, %0
+  %5 = add nsw i64 %4, %indvars.iv53
+  %arrayidx = getelementptr inbounds double, double* %A, i64 %5
+  %6 = load double, double* %arrayidx, align 8
+  %7 = mul nsw i64 %indvars.iv, %0
+  %8 = add nsw i64 %7, %indvars.iv53
+  %arrayidx14 = getelementptr inbounds double, double* %A, i64 %8
+  %9 = load double, double* %arrayidx14, align 8
+  %add15 = fadd double %6, %9
+  store double %add15, double* %arrayidx14, align 8
+  %indvars.iv.next = add i64 %indvars.iv, %1
+  %cmp6 = icmp slt i64 %indvars.iv.next, %0
+  br i1 %cmp6, label %for.body8, label %for.cond.cleanup7
+}
+
+attributes #0 = { norecurse nounwind uwtable }
Index: test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/Loops/SingleBlockLoop.ll
@@ -0,0 +1,29 @@
+; RUN: opt -mtriple=x86-- -analyze -loop-divergence %s | FileCheck %s
+
+; CHECK: Divergence of loop for.body {
+; CHECK-NEXT: DIVERGENT:  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT: DIVERGENT:  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+; CHECK-NEXT: DIVERGENT:  store float 4.200000e+01, float* %arrayidx, align 4
+; CHECK-NEXT: DIVERGENT:  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-NEXT: }
+define void @test1(float* nocapture %A, i64 %n) #0 {
+  entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float 4.200000e+01, float* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind }
Index: test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/NVPTX/daorder.ll
@@ -0,0 +1,47 @@
+; RUN: opt %s -analyze -divergence -use-gpu-da | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @daorder(i32 %n) {
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'daorder'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cond = icmp slt i32 %tid, 0
+  br i1 %cond, label %A, label %B ; divergent
+; CHECK: DIVERGENT: br i1 %cond,
+A:
+  %defAtA = add i32 %n, 1 ; uniform
+; CHECK-NOT: DIVERGENT: %defAtA = 
+  br label %C
+B:
+  %defAtB = add i32 %n, 2 ; uniform
+; CHECK-NOT: DIVERGENT: %defAtB = 
+  br label %C
+C:
+  %defAtC = phi i32 [ %defAtA, %A ], [ %defAtB, %B ] ; divergent
+; CHECK: DIVERGENT: %defAtC =
+  br label %D
+
+D:
+  %i = phi i32 [0, %C], [ %i.inc, %E ] ; uniform
+; CHECK-NOT: DIVERGENT: %i = phi
+  br label %E
+
+E:
+  %i.inc = add i32 %i, 1
+  %loopCnt = icmp slt i32 %i.inc, %n
+; CHECK-NOT: DIVERGENT: %loopCnt =
+  br i1 %loopCnt, label %D, label %exit
+
+exit:
+  ret i32 %n
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
+
+!nvvm.annotations = !{!0}
+!0 = !{i32 (i32)* @daorder, !"kernel", i32 1}
Index: test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
+++ test/Analysis/DivergenceAnalysis/NVPTX/diverge.ll
@@ -1,11 +1,11 @@
-; RUN: opt %s -analyze -divergence | FileCheck %s
+; RUN: opt %s -analyze -divergence -use-gpu-da | FileCheck %s
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
 ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
 define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'no_diverge'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %cond = icmp slt i32 %n, 0
@@ -27,7 +27,7 @@
 ;   c = b;
 ; return c;               // c is divergent: sync dependent
 define i32 @sync(i32 %a, i32 %b) {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'sync'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
   %cond = icmp slt i32 %tid, 5
@@ -48,7 +48,7 @@
 ; // c here is divergent because it is sync dependent on threadIdx.x >= 5
 ; return c;
 define i32 @mixed(i32 %n, i32 %a, i32 %b) {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'mixed'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
   %cond = icmp slt i32 %tid, 5
@@ -73,7 +73,7 @@
 
 ; We conservatively treats all parameters of a __device__ function as divergent.
 define i32 @device(i32 %n, i32 %a, i32 %b) {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'device'
 ; CHECK: DIVERGENT: i32 %n
 ; CHECK: DIVERGENT: i32 %a
 ; CHECK: DIVERGENT: i32 %b
@@ -98,7 +98,7 @@
 ;
 ; The i defined in the loop is used outside.
 define i32 @loop() {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'loop'
 entry:
   %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
   br label %loop
@@ -120,7 +120,7 @@
 
 ; Same as @loop, but the loop is in the LCSSA form.
 define i32 @lcssa() {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'lcssa'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   br label %loop
@@ -142,49 +142,6 @@
   ret i32 1
 }
 
-; This test contains an unstructured loop.
-;           +-------------- entry ----------------+
-;           |                                     |
-;           V                                     V
-; i1 = phi(0, i3)                            i2 = phi(0, i3)
-;     j1 = i1 + 1 ---> i3 = phi(j1, j2) <--- j2 = i2 + 2
-;           ^                 |                   ^
-;           |                 V                   |
-;           +-------- switch (tid / i3) ----------+
-;                             |
-;                             V
-;                        if (i3 == 5) // divergent
-; because sync dependent on (tid / i3).
-define i32 @unstructured_loop(i1 %entry_cond) {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop'
-entry:
-  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
-loop_entry_1:
-  %i1 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
-  %j1 = add i32 %i1, 1
-  br label %loop_body
-loop_entry_2:
-  %i2 = phi i32 [ 0, %entry ], [ %i3, %loop_latch ]
-  %j2 = add i32 %i2, 2
-  br label %loop_body
-loop_body:
-  %i3 = phi i32 [ %j1, %loop_entry_1 ], [ %j2, %loop_entry_2 ]
-  br label %loop_latch
-loop_latch:
-  %div = sdiv i32 %tid, %i3
-  switch i32 %div, label %branch [ i32 1, label %loop_entry_1
-                                   i32 2, label %loop_entry_2 ]
-branch:
-  %cmp = icmp eq i32 %i3, 5
-  br i1 %cmp, label %then, label %else
-; CHECK: DIVERGENT: br i1 %cmp,
-then:
-  ret i32 0
-else:
-  ret i32 1
-}
-
 ; Verifies sync-dependence is computed correctly in the absense of loops.
 define i32 @sync_no_loop(i32 %arg) {
 entry:
@@ -210,10 +167,9 @@
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
 
-!nvvm.annotations = !{!0, !1, !2, !3, !4, !5}
+!nvvm.annotations = !{!0, !1, !2, !3, !4}
 !0 = !{i32 (i32, i32, i32)* @no_diverge, !"kernel", i32 1}
 !1 = !{i32 (i32, i32)* @sync, !"kernel", i32 1}
 !2 = !{i32 (i32, i32, i32)* @mixed, !"kernel", i32 1}
 !3 = !{i32 ()* @loop, !"kernel", i32 1}
-!4 = !{i32 (i1)* @unstructured_loop, !"kernel", i32 1}
-!5 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1}
+!4 = !{i32 (i32)* @sync_no_loop, !"kernel", i32 1}
Index: test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll
===================================================================
--- /dev/null
+++ test/Analysis/DivergenceAnalysis/NVPTX/hidden_diverge.ll
@@ -0,0 +1,30 @@
+; RUN: opt %s -analyze -divergence -use-gpu-da | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) {
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'hidden_diverge'
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cond.var = icmp slt i32 %tid, 0
+  br i1 %cond.var, label %B, label %C ; divergent
+; CHECK: DIVERGENT: br i1 %cond.var,
+B:
+  %cond.uni = icmp slt i32 %n, 0
+  br i1 %cond.uni, label %C, label %merge ; uniform
+; CHECK-NOT: DIVERGENT: br i1 %cond.uni,
+C:
+  %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B  ]
+; CHECK: DIVERGENT: %phi.var.hidden = phi i32
+  br label %merge
+merge:
+  %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ]
+; CHECK: DIVERGENT: %phi.ipd = phi i32
+  ret i32 %phi.ipd
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+
+!nvvm.annotations = !{!0}
+!0 = !{i32 (i32, i32, i32)* @hidden_diverge, !"kernel", i32 1}
Index: test/Analysis/KernelDivergenceAnalysis/AMDGPU/kernel-args.ll
===================================================================
--- test/Analysis/KernelDivergenceAnalysis/AMDGPU/kernel-args.ll
+++ test/Analysis/KernelDivergenceAnalysis/AMDGPU/kernel-args.ll
@@ -1,6 +1,6 @@
 ; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s
 
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_ps':
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_amdgpu_ps':
 ; CHECK: DIVERGENT:
 ; CHECK-NOT: %arg0
 ; CHECK-NOT: %arg1
@@ -14,7 +14,7 @@
   ret void
 }
 
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_amdgpu_kernel':
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_amdgpu_kernel':
 ; CHECK-NOT: %arg0
 ; CHECK-NOT: %arg1
 ; CHECK-NOT: %arg2
@@ -26,7 +26,7 @@
   ret void
 }
 
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'test_c':
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'test_c':
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
Index: test/Analysis/KernelDivergenceAnalysis/AMDGPU/lit.local.cfg
===================================================================
--- /dev/null
+++ test/Analysis/KernelDivergenceAnalysis/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
Index: test/Analysis/KernelDivergenceAnalysis/NVPTX/diverge.ll
===================================================================
--- test/Analysis/KernelDivergenceAnalysis/NVPTX/diverge.ll
+++ test/Analysis/KernelDivergenceAnalysis/NVPTX/diverge.ll
@@ -5,7 +5,7 @@
 
 ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
 define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'no_diverge'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'no_diverge'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %cond = icmp slt i32 %n, 0
@@ -27,7 +27,7 @@
 ;   c = b;
 ; return c;               // c is divergent: sync dependent
 define i32 @sync(i32 %a, i32 %b) {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'sync'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'sync'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
   %cond = icmp slt i32 %tid, 5
@@ -48,7 +48,7 @@
 ; // c here is divergent because it is sync dependent on threadIdx.x >= 5
 ; return c;
 define i32 @mixed(i32 %n, i32 %a, i32 %b) {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'mixed'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'mixed'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
   %cond = icmp slt i32 %tid, 5
@@ -73,7 +73,7 @@
 
 ; We conservatively treats all parameters of a __device__ function as divergent.
 define i32 @device(i32 %n, i32 %a, i32 %b) {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'device'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'device'
 ; CHECK: DIVERGENT: i32 %n
 ; CHECK: DIVERGENT: i32 %a
 ; CHECK: DIVERGENT: i32 %b
@@ -98,7 +98,7 @@
 ;
 ; The i defined in the loop is used outside.
 define i32 @loop() {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'loop'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'loop'
 entry:
   %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
   br label %loop
@@ -120,7 +120,7 @@
 
 ; Same as @loop, but the loop is in the LCSSA form.
 define i32 @lcssa() {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'lcssa'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'lcssa'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   br label %loop
@@ -156,7 +156,7 @@
 ;                        if (i3 == 5) // divergent
 ; because sync dependent on (tid / i3).
 define i32 @unstructured_loop(i1 %entry_cond) {
-; CHECK-LABEL: Printing analysis 'Divergence Analysis' for function 'unstructured_loop'
+; CHECK-LABEL: Printing analysis 'Kernel Divergence Analysis' for function 'unstructured_loop'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   br i1 %entry_cond, label %loop_entry_1, label %loop_entry_2
Index: test/Analysis/KernelDivergenceAnalysis/NVPTX/lit.local.cfg
===================================================================
--- /dev/null
+++ test/Analysis/KernelDivergenceAnalysis/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True