Index: llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
+++ llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
@@ -73,9 +73,12 @@
   /// operands
   bool isAlwaysUniform(const Value &Val) const;

-  /// \brief Whether \p Val is a divergent value
+  /// \brief Whether \p Val is divergent at its definition.
   bool isDivergent(const Value &Val) const;

+  /// \brief Whether \p U is divergent. Uses of a uniform value can be divergent.
+  bool isDivergentUse(const Use &U) const;
+
   void print(raw_ostream &OS, const Module *) const;

 private:
@@ -189,12 +192,19 @@
   /// The GPU kernel this analysis result is for
   const Function &getFunction() const { return DA.getFunction(); }

-  /// Whether \p V is divergent.
+  /// Whether \p V is divergent at its definition.
   bool isDivergent(const Value &V) const;

-  /// Whether \p V is uniform/non-divergent
+  /// Whether \p U is divergent. Uses of a uniform value can be divergent.
+  bool isDivergentUse(const Use &U) const;
+
+  /// Whether \p V is uniform/non-divergent.
   bool isUniform(const Value &V) const { return !isDivergent(V); }

+  /// Whether \p U is uniform/non-divergent. Uses of a uniform value can be
+  /// divergent.
+  bool isUniformUse(const Use &U) const { return !isDivergentUse(U); }
+
   /// Print all divergent values in the kernel.
   void print(raw_ostream &OS, const Module *) const;
 };
Index: llvm/trunk/include/llvm/Analysis/LegacyDivergenceAnalysis.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/LegacyDivergenceAnalysis.h
+++ llvm/trunk/include/llvm/Analysis/LegacyDivergenceAnalysis.h
@@ -39,17 +39,18 @@
   void print(raw_ostream &OS, const Module *) const override;

   // Returns true if V is divergent at its definition.
-  //
-  // Even if this function returns false, V may still be divergent when used
-  // in a different basic block.
   bool isDivergent(const Value *V) const;

+  // Returns true if U is divergent. Uses of a uniform value can be divergent.
+  bool isDivergentUse(const Use *U) const;
+
   // Returns true if V is uniform/non-divergent.
-  //
-  // Even if this function returns true, V may still be divergent when used
-  // in a different basic block.
   bool isUniform(const Value *V) const { return !isDivergent(V); }

+  // Returns true if U is uniform/non-divergent. Uses of a uniform value can be
+  // divergent.
+  bool isUniformUse(const Use *U) const { return !isDivergentUse(U); }
+
   // Keep the analysis results uptodate by removing an erased value.
   void removeValue(const Value *V) { DivergentValues.erase(V); }

@@ -62,6 +63,9 @@

   // Stores all divergent values.
   DenseSet<const Value *> DivergentValues;
+
+  // Stores divergent uses of possibly uniform values.
+  DenseSet<const Use *> DivergentUses;
 };
 } // End llvm namespace
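Note on the intended calling convention of the new API (this sketch is not part of the change; the helper name and the assumption that a LegacyDivergenceAnalysis result is already available are hypothetical): a value that is uniform at its definition can still have divergent uses, typically when the use sits past a loop whose exit condition is divergent, so clients interested in a specific operand should prefer the use-based query.

// Minimal sketch, assuming an llvm::Instruction `I` and an already-computed
// LegacyDivergenceAnalysis `DA` (e.g. obtained via getAnalysis<> in a pass).
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/IR/Instruction.h"

// True only if every operand of I is uniform at this particular use site.
// Iterating operands with DA.isUniform(I.getOperand(Idx)) instead would miss
// uses that are temporally divergent even though the value itself is uniform.
static bool allOperandUsesUniform(const llvm::Instruction &I,
                                  const llvm::LegacyDivergenceAnalysis &DA) {
  for (const llvm::Use &U : I.operands())
    if (DA.isDivergentUse(&U))
      return false;
  return true;
}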
Index: llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp
+++ llvm/trunk/lib/Analysis/DivergenceAnalysis.cpp
@@ -412,6 +412,12 @@
   return DivergentValues.find(&V) != DivergentValues.end();
 }

+bool DivergenceAnalysis::isDivergentUse(const Use &U) const {
+  Value &V = *U.get();
+  Instruction &I = *cast<Instruction>(U.getUser());
+  return isDivergent(V) || isTemporalDivergent(*I.getParent(), V);
+}
+
 void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
   if (DivergentValues.empty())
     return;
@@ -449,6 +455,10 @@
   return DA.isDivergent(val);
 }

+bool GPUDivergenceAnalysis::isDivergentUse(const Use &use) const {
+  return DA.isDivergentUse(use);
+}
+
 void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
   OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
   DA.print(OS, mod);
Index: llvm/trunk/lib/Analysis/LegacyDivergenceAnalysis.cpp
===================================================================
--- llvm/trunk/lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ llvm/trunk/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -93,8 +93,9 @@
 class DivergencePropagator {
 public:
   DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT,
-                       PostDominatorTree &PDT, DenseSet<const Value *> &DV)
-      : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {}
+                       PostDominatorTree &PDT, DenseSet<const Value *> &DV,
+                       DenseSet<const Use *> &DU)
+      : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV), DU(DU) {}
   void populateWithSourcesOfDivergence();
   void propagate();

@@ -118,11 +119,14 @@
   PostDominatorTree &PDT;
   std::vector<Value *> Worklist; // Stack for DFS.
   DenseSet<const Value *> &DV;   // Stores all divergent values.
+  DenseSet<const Use *> &DU;     // Stores divergent uses of possibly uniform
+                                 // values.
 };

 void DivergencePropagator::populateWithSourcesOfDivergence() {
   Worklist.clear();
   DV.clear();
+  DU.clear();
   for (auto &I : instructions(F)) {
     if (TTI.isSourceOfDivergence(&I)) {
       Worklist.push_back(&I);
@@ -197,8 +201,10 @@
   // dominators of TI until it is outside the influence region.
   BasicBlock *InfluencedBB = ThisBB;
   while (InfluenceRegion.count(InfluencedBB)) {
-    for (auto &I : *InfluencedBB)
-      findUsersOutsideInfluenceRegion(I, InfluenceRegion);
+    for (auto &I : *InfluencedBB) {
+      if (!DV.count(&I))
+        findUsersOutsideInfluenceRegion(I, InfluenceRegion);
+    }
     DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom();
     if (IDomNode == nullptr)
       break;
@@ -208,9 +214,10 @@
 void DivergencePropagator::findUsersOutsideInfluenceRegion(
     Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion) {
-  for (User *U : I.users()) {
-    Instruction *UserInst = cast<Instruction>(U);
+  for (Use &Use : I.uses()) {
+    Instruction *UserInst = cast<Instruction>(Use.getUser());
     if (!InfluenceRegion.count(UserInst->getParent())) {
+      DU.insert(&Use);
       if (DV.insert(UserInst).second)
         Worklist.push_back(UserInst);
     }
   }
@@ -320,6 +327,7 @@
     return false;

   DivergentValues.clear();
+  DivergentUses.clear();
   gpuDA = nullptr;

   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -332,7 +340,7 @@
   } else {
     // run LLVM's existing DivergenceAnalysis
-    DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues);
+    DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues, DivergentUses);
     DP.populateWithSourcesOfDivergence();
     DP.propagate();
   }
@@ -351,6 +359,13 @@
   return DivergentValues.count(V);
 }

+bool LegacyDivergenceAnalysis::isDivergentUse(const Use *U) const {
+  if (gpuDA) {
+    return gpuDA->isDivergentUse(*U);
+  }
+  return DivergentValues.count(U->get()) || DivergentUses.count(U);
+}
+
 void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
   if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty())
     return;
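The client change in AMDGPUAtomicOptimizer below follows a mechanical pattern: ask for the operand's Use instead of its Value. As a standalone illustration (a sketch only; `operandIsDivergent` is an invented name, not part of this patch):

// Sketch of the caller-side rewrite applied below.
static bool operandIsDivergent(const llvm::Instruction &I, unsigned Idx,
                               const llvm::LegacyDivergenceAnalysis &DA) {
  // Before: definition-based query - can miss temporal divergence at the use.
  //   return DA.isDivergent(I.getOperand(Idx));
  // After: use-based query - also accounts for where the operand is consumed.
  return DA.isDivergentUse(&I.getOperandUse(Idx));
}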
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -142,11 +142,11 @@

   // If the pointer operand is divergent, then each lane is doing an atomic
   // operation on a different address, and we cannot optimize that.
-  if (DA->isDivergent(I.getOperand(PtrIdx))) {
+  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
     return;
   }

-  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
@@ -219,7 +219,7 @@

   const unsigned ValIdx = 0;

-  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

   // If the value operand is divergent, each lane is contributing a different
   // value to the atomic calculation. We can only optimize divergent values if
@@ -232,7 +232,7 @@
   // If any of the other arguments to the intrinsic are divergent, we can't
   // optimize the operation.
   for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
-    if (DA->isDivergent(I.getOperand(Idx))) {
+    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
       return;
     }
   }
Index: llvm/trunk/test/CodeGen/AMDGPU/divergence-at-use.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/divergence-at-use.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/divergence-at-use.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true < %s -use-gpu-divergence-analysis | FileCheck %s
+
+@local = addrspace(3) global i32 undef
+
+define void @reducible(i32 %x) {
+; CHECK-LABEL: reducible:
+; CHECK-NOT: dpp
+entry:
+  br label %loop
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
+  %gep = getelementptr i32, i32 addrspace(3)* @local, i32 %i
+  %cond = icmp ult i32 %i, %x
+  %i1 = add i32 %i, 1
+  br i1 %cond, label %loop, label %exit
+exit:
+  %old = atomicrmw add i32 addrspace(3)* %gep, i32 %x acq_rel
+  ret void
+}