diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -76,8 +76,36 @@
 } // end anonymous namespace
 
+static Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
+  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
+                                 : Intrinsic::amdgcn_dispatch_ptr;
+  StringRef Name = Intrinsic::getName(IntrinsicId);
+  return M.getFunction(Name);
+}
+
+static LoadInst *getUniqueLoadUser(Value *V) {
+  if (LoadInst *L = dyn_cast<LoadInst>(V)) {
+    return L->isSimple() ? L : nullptr;
+  }
+
+  if (!V->getType()->isPointerTy()) {
+    return nullptr;
+  }
+
+  LoadInst *UniqueLoad = nullptr;
+  for (User *U : V->users()) {
+    if (LoadInst *L = getUniqueLoadUser(U)) {
+      if (UniqueLoad) {
+        return nullptr;
+      }
+      UniqueLoad = L;
+    }
+  }
+  return UniqueLoad;
+}
+
 static bool processUse(CallInst *CI, bool IsV5OrAbove) {
-  Function *F = CI->getParent()->getParent();
+  Function *F = CI->getFunction();
 
   auto MD = F->getMetadata("reqd_work_group_size");
   const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
 
@@ -98,33 +126,23 @@
   // We expect to see several GEP users, casted to the appropriate type and
   // loaded.
   for (User *U : CI->users()) {
-    if (!U->hasOneUse())
+    LoadInst *Load = getUniqueLoadUser(U);
+    if (!Load) {
       continue;
-
-    int64_t Offset = 0;
-    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
-    auto *BCI = dyn_cast<BitCastInst>(U);
-    if (!Load && !BCI) {
-      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
-        continue;
-      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
-      BCI = dyn_cast<BitCastInst>(*U->user_begin());
     }
 
-    if (BCI) {
-      if (!BCI->hasOneUse())
-        continue;
-      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
-    }
-
-    if (!Load || !Load->isSimple())
+    APInt Offset(64, 0U);
+    if (Load != U &&
+        U->stripAndAccumulateConstantOffsets(DL, Offset, true) != CI) {
       continue;
+    }
 
     unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
 
     // TODO: Handle merged loads.
+    const int64_t OffsetValue = Offset.getSExtValue();
     if (IsV5OrAbove) { // Base is ImplicitArgPtr.
-      switch (Offset) {
+      switch (OffsetValue) {
       case HIDDEN_BLOCK_COUNT_X:
         if (LoadSize == 4)
           BlockCounts[0] = Load;
@@ -165,7 +183,7 @@
         break;
       }
     } else { // Base is DispatchPtr.
-      switch (Offset) {
+      switch (OffsetValue) {
       case WORKGROUP_SIZE_X:
         if (LoadSize == 2)
           GroupSizes[0] = Load;
@@ -315,17 +333,8 @@
 // TargetPassConfig for subtarget.
 bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
   bool MadeChange = false;
-  Function *BasePtr = nullptr;
   bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
-  if (IsV5OrAbove) {
-    StringRef ImplicitArgPtrName =
-        Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
-    BasePtr = M.getFunction(ImplicitArgPtrName);
-  } else { // Pre-V5.
-    StringRef DispatchPtrName =
-        Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
-    BasePtr = M.getFunction(DispatchPtrName);
-  }
+  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
 
   if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
     return false;
@@ -356,17 +365,8 @@
 PreservedAnalyses
 AMDGPULowerKernelAttributesPass::run(Function &F,
                                      FunctionAnalysisManager &AM) {
-  Function *BasePtr = nullptr;
   bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
-  if (IsV5OrAbove) {
-    StringRef ImplicitArgPtrName =
-        Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
-    BasePtr = F.getParent()->getFunction(ImplicitArgPtrName);
-  } else { // Pre-V5.
-    StringRef DispatchPtrName =
-        Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
-    BasePtr = F.getParent()->getFunction(DispatchPtrName);
-  }
+  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);
 
   if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
     return PreservedAnalyses::all();
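
Note for reviewers: the replacement for GetPointerBaseWithConstantOffset above relies on Value::stripAndAccumulateConstantOffsets, which walks constant GEPs (and offset-preserving casts) from a pointer back to its underlying base while accumulating the byte offset. Below is a minimal standalone sketch of that contract, assuming a 64-bit pointer index width (which holds for the address spaces these intrinsics return); the helper name getConstantOffsetFromBase is invented for illustration and is not part of the patch.

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Value.h"
  #include <cstdint>
  #include <optional>

  using namespace llvm;

  // Returns the byte offset of Ptr relative to Base if Ptr is Base plus a
  // compile-time-constant offset, std::nullopt otherwise.
  static std::optional<int64_t>
  getConstantOffsetFromBase(Value *Ptr, Value *Base, const DataLayout &DL) {
    // The APInt width must match the index width of Ptr's address space;
    // 64 bits is assumed here.
    APInt Offset(64, 0);
    // stripAndAccumulateConstantOffsets returns the stripped base pointer;
    // anything other than Base means Ptr is not a constant offset from it.
    if (Ptr->stripAndAccumulateConstantOffsets(
            DL, Offset, /*AllowNonInbounds=*/true) != Base)
      return std::nullopt;
    return Offset.getSExtValue();
  }

In processUse, the base is the amdgcn.implicitarg.ptr (or amdgcn.dispatch.ptr) call, and the accumulated offset is what the switch matches against the hidden-argument layout.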