Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,9 +14,11 @@
 
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
 
 #define DEBUG_TYPE "amdgpu-annotate-uniform"
@@ -70,9 +72,61 @@
   I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
 }
 
-bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
-  const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load);
-  return !MSSA->isLiveOnEntryDef(MA);
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
+  MemorySSAWalker *Walker = MSSA->getSkipSelfWalker();
+  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
+  SmallSet<MemoryAccess *, 8> Visited;
+
+  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
+
+  // Start with the nearest dominating clobbering access: it is either live
+  // on entry (nothing to do, the load is not clobbered), a MemoryDef, or a
+  // MemoryPhi if several MemoryDefs can define this memory state. In the
+  // latter case add all Defs to the WorkList and continue going up, checking
+  // all the definitions of this memory location until the root. When all the
+  // defs are exhausted and the entry state is reached, there is no clobber.
+  // Along the scan, ignore barriers and fences, which MemorySSA considers
+  // clobbers even though they do not actually write anything into memory.
+  while (!WorkList.empty()) {
+    MemoryAccess *MA = WorkList.pop_back_val();
+    if (!Visited.insert(MA).second)
+      continue;
+
+    if (MSSA->isLiveOnEntryDef(MA))
+      continue;
+
+    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+      Instruction *DefInst = Def->getMemoryInst();
+
+      LLVM_DEBUG(dbgs() << "  Def: " << *DefInst << '\n');
+
+      if (isa<FenceInst>(DefInst)) {
+        WorkList.push_back(Def->getDefiningAccess());
+        continue;
+      }
+
+      if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::amdgcn_s_barrier:
+        case Intrinsic::amdgcn_wave_barrier:
+          WorkList.push_back(Def->getDefiningAccess());
+          continue;
+        default:
+          break;
+        }
+      }
+
+      LLVM_DEBUG(dbgs() << "  -> load is clobbered\n");
+      return true;
+    }
+
+    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+    for (auto &Use : Phi->incoming_values())
+      WorkList.push_back(cast<MemoryAccess>(&Use));
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> no clobber\n");
+  return false;
 }
 
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
Index: llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
@@ -0,0 +1,187 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that a barrier or fence in between loads is not considered a clobber
+; for the purpose of converting vector loads into scalar loads.
+
+; GCN-LABEL: {{^}}simple_barrier:
+; GCN: s_load_dword s
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_barrier
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: ; wave barrier
+; GCN-NOT: global_load_dword
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @simple_barrier(i32 addrspace(1)* %arg) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  tail call void @llvm.amdgcn.wave.barrier()
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}memory_phi_no_clobber:
+; GCN: s_load_dword s
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: s_barrier
+; GCN-NOT: global_load_dword
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_no_clobber(i32 addrspace(1)* %arg) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  tail call void @llvm.amdgcn.s.barrier()
+  br label %if.end
+
+if.else:
+  fence syncscope("workgroup") release
+  br label %if.end
+
+if.end:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}memory_phi_clobber1:
+; GCN: s_load_dword s
+; GCN: s_barrier
+; GCN: global_store_dword
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_clobber1(i32 addrspace(1)* %arg) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  store i32 1, i32 addrspace(1)* %gep, align 4
+  br label %if.end
+
+if.else:
+  tail call void @llvm.amdgcn.s.barrier()
+  br label %if.end
+
+if.end:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}memory_phi_clobber2:
+; GCN-DAG: s_load_dword s
+; GCN-DAG: global_store_dword
+; GCN: s_barrier
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @memory_phi_clobber2(i32 addrspace(1)* %arg) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  tail call void @llvm.amdgcn.s.barrier()
+  br label %if.end
+
+if.else:
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  store i32 1, i32 addrspace(1)* %gep, align 4
+  br label %if.end
+
+if.end:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}clobbering_loop:
+; GCN: s_load_dword s
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @clobbering_loop(i32 addrspace(1)* %arg, i1 %cc) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br label %while.cond
+
+while.cond:
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  tail call void @llvm.amdgcn.wave.barrier()
+  br i1 %cc, label %while.cond, label %end
+
+end:
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_clobbering_loop:
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @no_clobbering_loop(i32 addrspace(1)* noalias %arg, i32 addrspace(1)* noalias %out, i32 %n) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  br label %while.cond
+
+while.cond:
+  %c = phi i32 [ 0, %bb ], [ %inc, %while.cond ]
+  %acc = phi i32 [ %i, %bb ], [ %i3, %while.cond ]
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %c
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %acc
+  tail call void @llvm.amdgcn.wave.barrier()
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, %n
+  br i1 %cc, label %while.cond, label %end
+
+end:
+  store i32 %i3, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}clobber_by_atomic_load:
+; GCN: s_load_dword s
+; GCN: global_load_dword {{.*}} glc
+; GCN: global_load_dword
+; GCN: global_store_dword
+define amdgpu_kernel void @clobber_by_atomic_load(i32 addrspace(1)* %arg) {
+bb:
+  %i = load i32, i32 addrspace(1)* %arg, align 4
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4
+  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i3 = add i32 %i2, %i
+  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
+  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier()
+declare void @llvm.amdgcn.wave.barrier()
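
For readers less familiar with the MemorySSA interface that the new isClobberedInFunction() builds on, the sketch below (not part of the patch) shows the underlying query in isolation: a legacy pass obtains MemorySSA and asks its walker for the nearest dominating access that may clobber a load; a live-on-entry definition means nothing in the function clobbers that load. The pass name CheckLoadClobbers and its registration string are hypothetical and exist only for illustration.

// Minimal sketch, assuming a legacy-pass-manager build of LLVM; the pass
// class and registration name are hypothetical, not part of the patch.
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

namespace {
class CheckLoadClobbers : public FunctionPass {
public:
  static char ID;
  CheckLoadClobbers() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MemorySSAWrapperPass>(); // make MemorySSA available
    AU.setPreservesAll();                   // analysis only, IR is untouched
  }

  bool runOnFunction(Function &F) override {
    MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
    MemorySSAWalker *Walker = MSSA.getWalker();
    for (BasicBlock &BB : F)
      for (Instruction &I : BB)
        if (auto *Load = dyn_cast<LoadInst>(&I)) {
          // Nearest dominating access that may clobber this load. If it is
          // the live-on-entry def, no store, fence, or call in this function
          // can clobber the loaded location.
          const MemoryAccess *MA = Walker->getClobberingMemoryAccess(Load);
          if (MSSA.isLiveOnEntryDef(MA))
            errs() << "not clobbered: " << *Load << '\n';
        }
    return false;
  }
};
} // end anonymous namespace

char CheckLoadClobbers::ID = 0;
static RegisterPass<CheckLoadClobbers>
    X("check-load-clobbers", "Report loads MemorySSA proves unclobbered");

The patch refines this single query: rather than treating the first MemoryDef it meets as a clobber, it keeps walking the defining accesses upward, looking through fences and the amdgcn s_barrier/wave_barrier intrinsics and through all incoming values of a MemoryPhi, and only reports a clobber for a remaining MemoryDef.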