diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -81,10 +81,37 @@
   return M.getFunction(Name);
 }
 
-} // end anonymous namespace
+LoadInst *getUniqueSimpleLoadUser(Value *V) {
+  if (LoadInst *L = dyn_cast<LoadInst>(V)) {
+    return L->isSimple() ? L : nullptr;
+  }
+
+  if (!V->getType()->isPointerTy()) {
+    return nullptr;
+  }
+
+  LoadInst *UniqueLoad = nullptr;
+  for (User *U : V->users()) {
+    if (LoadInst *L = getUniqueSimpleLoadUser(U)) {
+      if (UniqueLoad && UniqueLoad != L)
+        return nullptr;
+      UniqueLoad = L;
+    }
+  }
+  return UniqueLoad;
+}
+
+auto m_MatchAnyOf(ArrayRef<Value *> V) {
+  struct anyofval_ty {
+    const ArrayRef<Value *> Vals;
+    anyofval_ty(const ArrayRef<Value *> V) : Vals(V) {}
+    bool match(Value *V) { return is_contained(Vals, V); }
+  };
+  return anyofval_ty(V);
+}
 
 static bool processUse(CallInst *CI, bool IsV5OrAbove) {
-  Function *F = CI->getParent()->getParent();
+  Function *F = CI->getFunction();
 
   auto MD = F->getMetadata("reqd_work_group_size");
   const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
@@ -95,107 +122,87 @@
   if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
     return false;
 
-  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
-  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
-  Value *Remainders[3] = {nullptr, nullptr, nullptr};
-  Value *GridSizes[3] = {nullptr, nullptr, nullptr};
+  SmallVector<Value *> BlockCounts[3], GroupSizes[3], Remainders,
+      GridSizes[3];
 
   const DataLayout &DL = F->getParent()->getDataLayout();
 
   // We expect to see several GEP users, casted to the appropriate type and
   // loaded.
   for (User *U : CI->users()) {
-    if (!U->hasOneUse())
+    LoadInst *Load = getUniqueSimpleLoadUser(U);
+    if (!Load)
       continue;
 
-    int64_t Offset = 0;
-    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
-    auto *BCI = dyn_cast<BitCastInst>(U);
-    if (!Load && !BCI) {
-      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
-        continue;
-      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
-      BCI = dyn_cast<BitCastInst>(*U->user_begin());
-    }
-
-    if (BCI) {
-      if (!BCI->hasOneUse())
-        continue;
-      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
-    }
-
-    if (!Load || !Load->isSimple())
+    APInt Offset(64, 0U);
+    if (Load != U &&
+        U->stripAndAccumulateConstantOffsets(DL, Offset, true) != CI)
       continue;
 
     unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
 
     // TODO: Handle merged loads.
+    auto const OffsetValue = Offset.getSExtValue();
     if (IsV5OrAbove) { // Base is ImplicitArgPtr.
-      switch (Offset) {
+      switch (OffsetValue) {
       case HIDDEN_BLOCK_COUNT_X:
         if (LoadSize == 4)
-          BlockCounts[0] = Load;
+          BlockCounts[0].push_back(Load);
         break;
       case HIDDEN_BLOCK_COUNT_Y:
         if (LoadSize == 4)
-          BlockCounts[1] = Load;
+          BlockCounts[1].push_back(Load);
         break;
       case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4)
-          BlockCounts[2] = Load;
+          BlockCounts[2].push_back(Load);
         break;
       case HIDDEN_GROUP_SIZE_X:
         if (LoadSize == 2)
-          GroupSizes[0] = Load;
+          GroupSizes[0].push_back(Load);
         break;
       case HIDDEN_GROUP_SIZE_Y:
         if (LoadSize == 2)
-          GroupSizes[1] = Load;
+          GroupSizes[1].push_back(Load);
         break;
       case HIDDEN_GROUP_SIZE_Z:
         if (LoadSize == 2)
-          GroupSizes[2] = Load;
+          GroupSizes[2].push_back(Load);
         break;
       case HIDDEN_REMAINDER_X:
-        if (LoadSize == 2)
-          Remainders[0] = Load;
-        break;
       case HIDDEN_REMAINDER_Y:
-        if (LoadSize == 2)
-          Remainders[1] = Load;
-        break;
       case HIDDEN_REMAINDER_Z:
         if (LoadSize == 2)
-          Remainders[2] = Load;
+          Remainders.push_back(Load);
         break;
       default:
         break;
       }
     } else { // Base is DispatchPtr.
-      switch (Offset) {
+      switch (OffsetValue) {
      case WORKGROUP_SIZE_X:
         if (LoadSize == 2)
-          GroupSizes[0] = Load;
+          GroupSizes[0].push_back(Load);
         break;
       case WORKGROUP_SIZE_Y:
         if (LoadSize == 2)
-          GroupSizes[1] = Load;
+          GroupSizes[1].push_back(Load);
         break;
       case WORKGROUP_SIZE_Z:
         if (LoadSize == 2)
-          GroupSizes[2] = Load;
+          GroupSizes[2].push_back(Load);
         break;
       case GRID_SIZE_X:
         if (LoadSize == 4)
-          GridSizes[0] = Load;
+          GridSizes[0].push_back(Load);
         break;
       case GRID_SIZE_Y:
         if (LoadSize == 4)
-          GridSizes[1] = Load;
+          GridSizes[1].push_back(Load);
         break;
       case GRID_SIZE_Z:
         if (LoadSize == 4)
-          GridSizes[2] = Load;
+          GridSizes[2].push_back(Load);
         break;
       default:
         break;
@@ -213,31 +220,29 @@
   // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned
   // for __ockl_get_local_size.
   for (int I = 0; I < 3; ++I) {
-    Value *BlockCount = BlockCounts[I];
-    if (!BlockCount)
-      continue;
-
     using namespace llvm::PatternMatch;
     auto GroupIDIntrin =
         I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
               : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                          : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
 
-    for (User *ICmp : BlockCount->users()) {
-      ICmpInst::Predicate Pred;
-      if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
-        if (Pred != ICmpInst::ICMP_ULT)
-          continue;
-        ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
-        MadeChange = true;
+    for (Value *BlockCount : BlockCounts[I]) {
+      for (User *ICmp : BlockCount->users()) {
+        ICmpInst::Predicate Pred;
+        if (match(ICmp,
+                  m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
+          if (Pred != ICmpInst::ICMP_ULT)
+            continue;
+          ICmp->replaceAllUsesWith(
+              llvm::ConstantInt::getTrue(ICmp->getType()));
+          MadeChange = true;
+        }
       }
     }
   }
 
   // All remainders should be 0 with uniform work group size.
   for (Value *Remainder : Remainders) {
-    if (!Remainder)
-      continue;
     Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
     MadeChange = true;
   }
@@ -262,37 +267,38 @@
   // condition is false (except for group_id == 0, where the select result is
   // the same).
   for (int I = 0; I < 3; ++I) {
-    Value *GroupSize = GroupSizes[I];
-    Value *GridSize = GridSizes[I];
-    if (!GroupSize || !GridSize)
-      continue;
-
     using namespace llvm::PatternMatch;
     auto GroupIDIntrin =
         I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
               : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                          : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+    auto MatchGroupSizesZExt = m_ZExt(m_MatchAnyOf(GroupSizes[I]));
 
-    for (User *U : GroupSize->users()) {
-      auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
-      if (!ZextGroupSize)
+    for (Value *GridSize : GridSizes[I]) {
+      if (GroupSizes[I].empty())
         continue;
 
-      for (User *UMin : ZextGroupSize->users()) {
-        if (match(UMin,
-                  m_UMin(m_Sub(m_Specific(GridSize),
-                               m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
-                         m_Specific(ZextGroupSize)))) {
-          if (HasReqdWorkGroupSize) {
-            ConstantInt *KnownSize
-              = mdconst::extract<ConstantInt>(MD->getOperand(I));
-            UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
-                KnownSize, UMin->getType(), false));
-          } else {
-            UMin->replaceAllUsesWith(ZextGroupSize);
+      for (User *Sub : GridSize->users()) {
+        Instruction *ZExtGroupSize;
+        auto MatchAndBindGroupSizesZExt =
+            m_CombineAnd(MatchGroupSizesZExt, m_Instruction(ZExtGroupSize));
+        if (!match(Sub,
+                   m_Sub(m_Specific(GridSize),
+                         m_Mul(GroupIDIntrin, MatchAndBindGroupSizesZExt))))
+          continue;
+        for (User *UMin : Sub->users()) {
+          if (match(UMin, m_UMin(m_Specific(Sub), MatchGroupSizesZExt))) {
+            if (HasReqdWorkGroupSize) {
+              ConstantInt *KnownSize =
+                  mdconst::extract<ConstantInt>(MD->getOperand(I));
+              UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
+                  KnownSize, UMin->getType(), false));
+            } else {
+              UMin->replaceAllUsesWith(ZExtGroupSize);
+            }
+
+            MadeChange = true;
          }
-
-          MadeChange = true;
         }
      }
    }
@@ -304,19 +310,18 @@
     return MadeChange;
 
   for (int I = 0; I < 3; I++) {
-    Value *GroupSize = GroupSizes[I];
-    if (!GroupSize)
-      continue;
-
-    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
-    GroupSize->replaceAllUsesWith(
-        ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
-    MadeChange = true;
+    for (Value *GroupSize : GroupSizes[I]) {
+      ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
+      GroupSize->replaceAllUsesWith(
+          ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
+      MadeChange = true;
+    }
   }
 
   return MadeChange;
 }
 
+} // end anonymous namespace
+
 // TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
 // TargetPassConfig for subtarget.
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt-opaque-ptr.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt-opaque-ptr.ll
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt-opaque-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt-opaque-ptr.ll
@@ -63,6 +63,32 @@
   ret void
 }
 
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_local_size_x_2_block_count_users_opaque_pointer(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_local_size_x_2_block_count_users_opaque_pointer(
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
+; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, ptr addrspace(4) [[GEP_LOCAL_SIZE]], align 4
+; GCN-NEXT:    store volatile i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT:%.*]], align 2
+; GCN-NEXT:    store volatile i16 [[LOCAL_SIZE]], ptr addrspace(1) [[OUT]], align 2
+; GCN-NEXT:    ret void
+;
+  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %block.count.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+  %cmp.id.count = icmp ult i32 %group.id, %block.count.x
+  %local.size.offset = select i1 %cmp.id.count, i64 12, i64 18
+  %gep.local.size = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 %local.size.offset
+  %local.size = load i16, ptr addrspace(4) %gep.local.size, align 2
+  store volatile i16 %local.size, i16 addrspace(1)* %out
+  %cmp.id.count.1 = icmp ult i32 %group.id, %block.count.x
+  %local.size.offset.1 = select i1 %cmp.id.count.1, i64 12, i64 18
+  %gep.local.size.1 = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 %local.size.offset.1
+  %local.size.1 = load i16, ptr addrspace(4) %gep.local.size.1, align 2
+  store volatile i16 %local.size.1, i16 addrspace(1)* %out
+  ret void
+}
+
 declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
 declare i32 @llvm.amdgcn.workgroup.id.y() #1
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
@@ -93,7 +93,7 @@
 ; GCN-NEXT:    ret void
 ;
   %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
+  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 20
   %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
   %remainder.y = load i16, i16 addrspace(4)* %bc.y, align 2
   store i16 %remainder.y, i16 addrspace(1)* %out
@@ -107,13 +107,37 @@
 ; GCN-NEXT:    ret void
 ;
   %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
+  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 22
   %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
   %remainder.z = load i16, i16 addrspace(4)* %bc.z, align 2
   store i16 %remainder.z, i16 addrspace(1)* %out
   ret void
 }
 
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_remainder_xyz(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_remainder_xyz(
+; GCN-NEXT:    store volatile i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    store volatile i16 0, i16 addrspace(1)* [[OUT]], align 2
+; GCN-NEXT:    store volatile i16 0, i16 addrspace(1)* [[OUT]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
+  %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
+  %remainder.x = load i16, i16 addrspace(4)* %bc.x, align 2
+  store volatile i16 %remainder.x, i16 addrspace(1)* %out
+  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 20
+  %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
+  %remainder.y = load i16, i16 addrspace(4)* %bc.y, align 2
+  store volatile i16 %remainder.y, i16 addrspace(1)* %out
+  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 22
+  %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
+  %remainder.z = load i16, i16 addrspace(4)* %bc.z, align 2
+  store volatile i16 %remainder.z, i16 addrspace(1)* %out
+  ret void
+}
+
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
 define amdgpu_kernel void @get_work_group_size_x(i16 addrspace(1)* %out) #0 {
 ; GCN-LABEL: @get_work_group_size_x(
@@ -210,6 +234,30 @@
   ret void
 }
 
+define amdgpu_kernel void @get_all_work_group_sizes_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
+; GCN-LABEL: @get_all_work_group_sizes_reqd(
+; GCN-NEXT:    store volatile i16 8, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    store volatile i16 16, i16 addrspace(1)* [[OUT]], align 2
+; GCN-NEXT:    store volatile i16 2, i16 addrspace(1)* [[OUT]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12
+  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14
+  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16
+  %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
+  %bc.y.0 = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
+  %bc.y = bitcast i16 addrspace(4)* %bc.y.0 to i16 addrspace(4)*
+  %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
+  %bc.z.select = select i1 true, i16 addrspace(4)* %bc.z, i16 addrspace(4)* %bc.z
+  %group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2
+  %group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2
+  %group.size.z = load i16, i16 addrspace(4)* %bc.z.select, align 2
+  store volatile i16 %group.size.x, i16 addrspace(1)* %out
+  store volatile i16 %group.size.y, i16 addrspace(1)* %out
+  store volatile i16 %group.size.z, i16 addrspace(1)* %out
+  ret void
+}
 
 declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
--- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
@@ -372,6 +372,96 @@
   ret void
 }
 
+; CHECK-LABEL: @all_local_size_twice(
+; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
+; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
+; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
+; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
+; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
+; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
+define amdgpu_kernel void @all_local_size_twice(i64 addrspace(1)* nocapture readnone %out) #0 !reqd_work_group_size !0 {
+  %tmp.i = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
+  %tmp3.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
+  %tmp4.i = bitcast i8 addrspace(4)* %tmp3.i to i32 addrspace(4)*
+  %tmp5.i = load i32, i32 addrspace(4)* %tmp4.i, align 4
+  %tmp6.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
+  %tmp7.i = bitcast i8 addrspace(4)* %tmp6.i to i16 addrspace(4)*
+  %tmp8.i = load i16, i16 addrspace(4)* %tmp7.i, align 4
+  %tmp29.i = zext i16 %tmp8.i to i32
+  %tmp30.i = mul i32 %tmp2.i, %tmp29.i
+  %tmp31.i = sub i32 %tmp5.i, %tmp30.i
+  %umin0 = call i32 @llvm.umin.i32(i32 %tmp31.i, i32 %tmp29.i)
+  %tmp34.i = zext i32 %umin0 to i64
+  %tmp10.i = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
+  %tmp11.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
+  %tmp12.i = bitcast i8 addrspace(4)* %tmp11.i to i32 addrspace(4)*
+  %tmp13.i = load i32, i32 addrspace(4)* %tmp12.i, align 8
+  %tmp14.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
+  %tmp15.i = bitcast i8 addrspace(4)* %tmp14.i to i16 addrspace(4)*
+  %tmp16.i = load i16, i16 addrspace(4)* %tmp15.i, align 2
+  %tmp29.i9 = zext i16 %tmp16.i to i32
+  %tmp30.i10 = mul i32 %tmp10.i, %tmp29.i9
+  %tmp31.i11 = sub i32 %tmp13.i, %tmp30.i10
+  %umin1 = call i32 @llvm.umin.i32(i32 %tmp31.i11, i32 %tmp29.i9)
+  %tmp34.i14 = zext i32 %umin1 to i64
+  %tmp18.i = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
+  %tmp19.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
+  %tmp20.i = bitcast i8 addrspace(4)* %tmp19.i to i32 addrspace(4)*
+  %tmp21.i = load i32, i32 addrspace(4)* %tmp20.i, align 4
+  %tmp22.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
+  %tmp23.i = bitcast i8 addrspace(4)* %tmp22.i to i16 addrspace(4)*
+  %tmp24.i = load i16, i16 addrspace(4)* %tmp23.i, align 8
+  %tmp29.i2 = zext i16 %tmp24.i to i32
+  %tmp30.i3 = mul i32 %tmp18.i, %tmp29.i2
+  %tmp31.i4 = sub i32 %tmp21.i, %tmp30.i3
+  %umin2 = call i32 @llvm.umin.i32(i32 %tmp31.i4, i32 %tmp29.i2)
+  %tmp34.i7 = zext i32 %umin2 to i64
+  %tmp2.i.1 = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
+  %tmp3.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
+  %tmp4.i.1 = bitcast i8 addrspace(4)* %tmp3.i.1 to i32 addrspace(4)*
+  %tmp5.i.1 = load i32, i32 addrspace(4)* %tmp4.i.1, align 4
+  %tmp6.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
+  %tmp7.i.1 = bitcast i8 addrspace(4)* %tmp6.i.1 to i16 addrspace(4)*
+  %tmp8.i.1 = load i16, i16 addrspace(4)* %tmp7.i.1, align 4
+  %tmp29.i.1 = zext i16 %tmp8.i.1 to i32
+  %tmp30.i.1 = mul i32 %tmp2.i.1, %tmp29.i.1
+  %tmp31.i.1 = sub i32 %tmp5.i.1, %tmp30.i.1
+  %umin0.1 = call i32 @llvm.umin.i32(i32 %tmp31.i.1, i32 %tmp29.i.1)
+  %tmp34.i.1 = zext i32 %umin0.1 to i64
+  %tmp10.i.1 = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
+  %tmp11.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
+  %tmp12.i.1 = bitcast i8 addrspace(4)* %tmp11.i.1 to i32 addrspace(4)*
+  %tmp13.i.1 = load i32, i32 addrspace(4)* %tmp12.i.1, align 8
+  %tmp14.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
+  %tmp15.i.1 = bitcast i8 addrspace(4)* %tmp14.i.1 to i16 addrspace(4)*
+  %tmp16.i.1 = load i16, i16 addrspace(4)* %tmp15.i.1, align 2
+  %tmp29.i9.1 = zext i16 %tmp16.i.1 to i32
+  %tmp30.i10.1 = mul i32 %tmp10.i.1, %tmp29.i9.1
+  %tmp31.i11.1 = sub i32 %tmp13.i.1, %tmp30.i10.1
+  %umin1.1 = call i32 @llvm.umin.i32(i32 %tmp31.i11.1, i32 %tmp29.i9) ; use tmp29.i9 instead of tmp29.i9.1
+  %tmp34.i14.1 = zext i32 %umin1.1 to i64
+  %tmp18.i.1 = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
+  %tmp19.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
+  %tmp20.i.1 = bitcast i8 addrspace(4)* %tmp19.i.1 to i32 addrspace(4)*
+  %tmp21.i.1 = load i32, i32 addrspace(4)* %tmp20.i.1, align 4
+  %tmp22.i.1 = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
+  %tmp23.i.1 = bitcast i8 addrspace(4)* %tmp22.i.1 to i16 addrspace(4)*
+  %tmp24.i.1 = load i16, i16 addrspace(4)* %tmp23.i.1, align 8
+  %tmp29.i2.1 = zext i16 %tmp24.i.1 to i32
+  %tmp30.i3.1 = mul i32 %tmp18.i.1, %tmp29.i2.1
+  %tmp31.i4.1 = sub i32 %tmp21.i.1, %tmp30.i3.1
+  %umin2.1 = call i32 @llvm.umin.i32(i32 %tmp31.i4.1, i32 %tmp29.i2.1)
+  %tmp34.i7.1 = zext i32 %umin2.1 to i64
+  store volatile i64 %tmp34.i, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i14, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i7, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i.1, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i14.1, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i7.1, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; TODO: Should be able to handle this, but not much reason to.
 ; CHECK-LABEL: @partial_load_group_size_x(
 ; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()