Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,12 +14,11 @@
 
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallSet.h"
+#include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
 
 #define DEBUG_TYPE "amdgpu-annotate-uniform"
@@ -53,7 +52,6 @@
 
   void visitBranchInst(BranchInst &I);
   void visitLoadInst(LoadInst &I);
-  bool isClobberedInFunction(LoadInst * Load);
 };
 
 } // End anonymous namespace
@@ -75,81 +73,6 @@
   I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
 }
 
-bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
-  MemorySSAWalker *Walker = MSSA->getWalker();
-  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
-  SmallSet<MemoryAccess *, 8> Visited;
-  MemoryLocation Loc(MemoryLocation::get(Load));
-
-  const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool {
-    Instruction *DefInst = Def->getMemoryInst();
-    LLVM_DEBUG(dbgs() << "  Def: " << *DefInst << '\n');
-
-    if (isa<FenceInst>(DefInst))
-      return false;
-
-    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
-      switch (II->getIntrinsicID()) {
-      case Intrinsic::amdgcn_s_barrier:
-      case Intrinsic::amdgcn_wave_barrier:
-        return false;
-      default:
-        break;
-      }
-    }
-
-    // Ignore atomics not aliasing with the original load, any atomic is a
-    // universal MemoryDef from MSSA's point of view too, just like a fence.
-    const auto checkNoAlias = [this, Load](auto I) -> bool {
-      return I && AA->isNoAlias(I->getPointerOperand(),
-                                Load->getPointerOperand());
-    };
-
-    if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
-        checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
-      return false;
-
-    return true;
-  };
-
-  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
-
-  // Start with a nearest dominating clobbering access, it will be either
-  // live on entry (nothing to do, load is not clobbered), MemoryDef, or
-  // MemoryPhi if several MemoryDefs can define this memory state. In that
-  // case add all Defs to WorkList and continue going up and checking all
-  // the definitions of this memory location until the root. When all the
-  // defs are exhausted and came to the entry state we have no clobber.
-  // Along the scan ignore barriers and fences which are considered clobbers
-  // by the MemorySSA, but not really writing anything into the memory.
-  while (!WorkList.empty()) {
-    MemoryAccess *MA = WorkList.pop_back_val();
-    if (!Visited.insert(MA).second)
-      continue;
-
-    if (MSSA->isLiveOnEntryDef(MA))
-      continue;
-
-    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
-      if (isReallyAClobber(Def)) {
-        LLVM_DEBUG(dbgs() << "  -> load is clobbered\n");
-        return true;
-      }
-
-      WorkList.push_back(
-          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
-      continue;
-    }
-
-    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
-    for (auto &Use : Phi->incoming_values())
-      WorkList.push_back(cast<MemoryAccess>(&Use));
-  }
-
-  LLVM_DEBUG(dbgs() << "  -> no clobber\n");
-  return false;
-}
-
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
   if (DA->isUniform(&I))
     setUniformMetadata(&I);
@@ -169,8 +92,7 @@
   if (!isEntryFunc)
     return;
   bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
-  bool NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
-  if (NotClobbered)
+  if (GlobalLoad && !AMDGPU::isClobberedInFunction(&I, MSSA, AA))
     setNoClobberMetadata(&I);
 }
 
Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -16,7 +16,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/InitializePasses.h"
@@ -30,6 +32,8 @@
 
 class AMDGPUPromoteKernelArguments : public FunctionPass {
   MemorySSA *MSSA;
+  AliasAnalysis *AA;
+
   Instruction *ArgCastInsertPt;
 
   SmallVector<Value *> Ptrs;
@@ -43,11 +47,12 @@
 
   AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}
 
-  bool run(Function &F, MemorySSA &MSSA);
+  bool run(Function &F, MemorySSA &MSSA, AliasAnalysis &AA);
 
   bool runOnFunction(Function &F) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
     AU.addRequired<MemorySSAWrapperPass>();
     AU.setPreservesAll();
   }
@@ -75,9 +80,8 @@
            PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) ||
           LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
         break;
-      const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD);
       // TODO: This load poprobably can be promoted to constant address space.
-      if (MSSA->isLiveOnEntryDef(MA))
+      if (!AMDGPU::isClobberedInFunction(LD, MSSA, AA))
         Ptrs.push_back(LD);
       break;
     }
@@ -131,7 +135,8 @@
   return InsPt;
 }
 
-bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
+bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
+                                       AliasAnalysis &AA) {
   if (skipFunction(F))
     return false;
 
@@ -141,6 +146,7 @@
   ArgCastInsertPt = &*getInsertPt(*F.begin());
 
   this->MSSA = &MSSA;
+  this->AA = &AA;
 
   for (Argument &Arg : F.args()) {
     if (Arg.use_empty())
@@ -166,11 +172,13 @@
 
 bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
   MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
-  return run(F, MSSA);
+  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+  return run(F, MSSA, AA);
 }
 
 INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
                       "AMDGPU Promote Kernel Arguments", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
                     "AMDGPU Promote Kernel Arguments", false, false)
@@ -185,7 +193,8 @@
 AMDGPUPromoteKernelArgumentsPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
-  if (AMDGPUPromoteKernelArguments().run(F, MSSA)) {
+  AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  if (AMDGPUPromoteKernelArguments().run(F, MSSA, AA)) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
     PA.preserve<MemorySSAAnalysis>();
Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
===================================================================
--- /dev/null
+++ llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -0,0 +1,35 @@
+//===- AMDGPUMemoryUtils.h - Memory related helper functions -*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+
+namespace llvm {
+
+class AAResults;
+class LoadInst;
+class MemoryDef;
+class MemorySSA;
+class Value;
+
+namespace AMDGPU {
+
+/// Given a \p Def clobbering a load from \p Ptr according to the MSSA, check
+/// if this is actually a memory update or an artificial clobber to facilitate
+/// ordering constraints.
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
+
+/// Check if a \p Load is clobbered in its function.
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+                           AAResults *AA);
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -0,0 +1,104 @@
+//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+#define DEBUG_TYPE "amdgpu-memory-utils"
+
+using namespace llvm;
+
+namespace llvm {
+
+namespace AMDGPU {
+
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
+  Instruction *DefInst = Def->getMemoryInst();
+
+  if (isa<FenceInst>(DefInst))
+    return false;
+
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::amdgcn_s_barrier:
+    case Intrinsic::amdgcn_wave_barrier:
+      return false;
+    default:
+      break;
+    }
+  }
+
+  // Ignore atomics not aliasing with the original load, any atomic is a
+  // universal MemoryDef from MSSA's point of view too, just like a fence.
+  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
+    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
+  };
+
+  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
+      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
+    return false;
+
+  return true;
+}
+
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+                           AAResults *AA) {
+  MemorySSAWalker *Walker = MSSA->getWalker();
+  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
+  SmallSet<MemoryAccess *, 8> Visited;
+  MemoryLocation Loc(MemoryLocation::get(Load));
+
+  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
+
+  // Start with a nearest dominating clobbering access, it will be either
+  // live on entry (nothing to do, load is not clobbered), MemoryDef, or
+  // MemoryPhi if several MemoryDefs can define this memory state. In that
+  // case add all Defs to WorkList and continue going up and checking all
+  // the definitions of this memory location until the root. When all the
+  // defs are exhausted and came to the entry state we have no clobber.
+  // Along the scan ignore barriers and fences which are considered clobbers
+  // by the MemorySSA, but not really writing anything into the memory.
+  while (!WorkList.empty()) {
+    MemoryAccess *MA = WorkList.pop_back_val();
+    if (!Visited.insert(MA).second)
+      continue;
+
+    if (MSSA->isLiveOnEntryDef(MA))
+      continue;
+
+    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+      LLVM_DEBUG(dbgs() << "  Def: " << *Def->getMemoryInst() << '\n');
+
+      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
+        LLVM_DEBUG(dbgs() << "  -> load is clobbered\n");
+        return true;
+      }
+
+      WorkList.push_back(
+          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
+      continue;
+    }
+
+    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+    for (auto &Use : Phi->incoming_values())
+      WorkList.push_back(cast<MemoryAccess>(&Use));
+  }
+
+  LLVM_DEBUG(dbgs() << "  -> no clobber\n");
+  return false;
+}
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
Index: llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,6 +2,7 @@
   AMDGPUAsmUtils.cpp
   AMDGPUBaseInfo.cpp
   AMDGPULDSUtils.cpp
+  AMDGPUMemoryUtils.cpp
   AMDGPUPALMetadata.cpp
   AMDKernelCodeTUtils.cpp
 
Index: llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
+++ llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
@@ -314,4 +314,31 @@
   ret void
 }
 
+; GCN-LABEL: ptr_nest_3_barrier:
+; GCN-COUNT-2: global_load_dwordx2
+; GCN: global_store_dword
+define amdgpu_kernel void @ptr_nest_3_barrier(float** addrspace(1)* nocapture readonly %Arg) {
+; CHECK-LABEL: @ptr_nest_3_barrier(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8
+; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
+; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8
+; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
+; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i
+  tail call void @llvm.amdgcn.s.barrier()
+  %p2 = load float**, float** addrspace(1)* %p1, align 8
+  %p3 = load float*, float** %p2, align 8
+  store float 0.000000e+00, float* %p3, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @llvm.amdgcn.s.barrier()
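
For reference, a minimal sketch of how another pass could query the new utility, in the same way the two callers above do. The wrapper function below is hypothetical and not part of the patch; it only relies on the declarations added in AMDGPUMemoryUtils.h and on MemorySSA/AliasAnalysis results the caller already holds.

// Hypothetical usage sketch (not part of the patch): ask whether a load can
// safely be treated as reading the memory state live on function entry.
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Returns true if MemorySSA, refined by the AMDGPU-specific clobber check,
// proves the load is not clobbered anywhere in its function.
static bool isNoClobberLoad(const LoadInst &LI, MemorySSA &MSSA,
                            AAResults &AA) {
  return !AMDGPU::isClobberedInFunction(&LI, &MSSA, &AA);
}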