Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -20,9 +20,9 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/KnownBits.h"
@@ -1068,8 +1068,26 @@
   FMF.setFast();
   Builder.setFastMathFlags(FMF);
 
-  if (divHasSpecialOptimization(I, X, Y))
+  if (divHasSpecialOptimization(I, X, Y)) {
+    // Special case for wave ID expression. The divide result is known wave
+    // uniform, but the source numerator is not. Hack in a readfirstlane to
+    // inform codegen of this.
+    //
+    // FIXME: Ideally the codegen divergence analysis would recognize the same
+    // pattern.
+    if (DA->isUniform(&I) && !DA->isUniform(I.getOperand(0))) {
+      Function *Readfirstlane =
+          Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_readfirstlane);
+
+      // Clone the operation to simplify the value replacement.
+      Value *NewOp = Builder.CreateBinOp(I.getOpcode(), X, Y, I.getName());
+      cast<Instruction>(NewOp)->copyIRFlags(&I);
+
+      return Builder.CreateCall(Readfirstlane, {NewOp});
+    }
+
     return nullptr; // Keep it for later optimization.
+  }
 
   bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
   bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/KnownBits.h"
 
 using namespace llvm;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "AMDGPUtti"
 
@@ -898,6 +899,26 @@
   return false;
 }
 
+/// Recognize a load from the offsets where workgroup sizes are stored in the
+/// dispatch packet.
+static bool isLoadFromGroupSize(const DataLayout &DL, Value *V,
+                                unsigned GroupIdx) {
+  Value *ZextSrc = nullptr;
+  if (!match(V, m_ZExt(m_Value(ZextSrc))) ||
+      !ZextSrc->getType()->isIntegerTy(16))
+    return false;
+
+  LoadInst *LI = dyn_cast<LoadInst>(ZextSrc);
+  if (!LI)
+    return false;
+
+  int64_t Offset = 0;
+  Value *PtrBase =
+      GetPointerBaseWithConstantOffset(LI->getPointerOperand(), Offset, DL);
+  return match(PtrBase, m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>()) &&
+         Offset == 4 + 2 * GroupIdx;
+}
+
 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
     switch (Intrinsic->getIntrinsicID()) {
@@ -919,6 +940,51 @@
     return false;
   }
 
+  if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+    switch (Inst->getOpcode()) {
+    case Instruction::SDiv:
+    case Instruction::SRem:
+    case Instruction::UDiv:
+    case Instruction::URem: {
+      // Special case for the pattern used for wave indexing calculations:
+      //   (blockIdx * blockDim + threadId) / wavesize
+      //   (blockIdx * blockDim + threadId) % wavesize
+
+      Instruction *Num;
+      if (match(Inst, m_BinOp(m_Instruction(Num),
+                              m_SpecificInt(ST->getWavefrontSize())))) {
+        const DataLayout &DL = Inst->getModule()->getDataLayout();
+        for (unsigned Idx = 0; Idx < 3; ++Idx) {
+          auto ItemIDIntrin =
+              Idx == 0
+                  ? m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>()
+                  : (Idx == 1 ? m_Intrinsic<Intrinsic::amdgcn_workitem_id_y>()
+                              : m_Intrinsic<Intrinsic::amdgcn_workitem_id_z>());
+
+          auto GroupIDIntrin =
+              Idx == 0
+                  ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+                  : (Idx == 1
+                         ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+                         : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+          Value *MulLHS;
+          // Match the expression with the uniform-work-group-size optimization
+          // applied.
+          if (match(Num, m_Add(m_Mul(m_Value(MulLHS), GroupIDIntrin),
+                               ItemIDIntrin)) &&
+              isLoadFromGroupSize(DL, MulLHS, Idx))
+            return true;
+        }
+      }
+
+      break;
+    }
+    default:
+      break;
+    }
+  }
+
   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
   if (!ExtValue)
     return false;
Index: llvm/test/Analysis/DivergenceAnalysis/AMDGPU/wave-id-computation.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/DivergenceAnalysis/AMDGPU/wave-id-computation.ll
@@ -0,0 +1,387 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1031 -passes='amdgpu-lower-kernel-attributes,dce,print<divergence>' -disable-output %s 2>&1 | FileCheck -match-full-lines %s
+
+; Specially recognize workitem ID division by wavesize as uniform. Issue 54010.
+
+; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x':
+; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13
+; CHECK-NEXT: %i15 = sdiv exact i32 %i14, 32
+; CHECK-NEXT: ret i32 %i15
+define hidden i32 @calculate_wid_x() #0 {
+bb:
+  %i = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4
+  %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)*
+  %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1
+  %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12
+  %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)*
+  %i7 = load i32, i32 addrspace(4)* %i6, align 4
+  %i8 = zext i16 %i4 to i32
+  %i9 = mul i32 %i, %i8
+  %i10 = sub i32 %i7, %i9
+  %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8)
+  %i12 = mul i32 %i11, %i
+  %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+  %i14 = add i32 %i12, %i13
+  %i15 = sdiv exact i32 %i14, 32
+  ret i32 %i15
+}
+
+; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_lid_x':
+; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13
+; CHECK-NEXT: %i15 = srem i32 %i14, 32
+; CHECK-NEXT: ret i32 %i15
+define hidden i32 @calculate_lid_x() #0 {
+bb:
+  %i = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4
+  %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)*
+  %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1
+  %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12
+  %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)*
+  %i7 = load i32, i32 addrspace(4)* %i6, align 4
+  %i8 = zext i16 %i4 to i32
+  %i9 = mul i32 %i, %i8
+  %i10 = sub i32 %i7, %i9
+  %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8)
+  %i12 = mul i32 %i11, %i
+  %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+  %i14 = add i32 %i12, %i13
+  %i15 = srem i32 %i14, 32
+  ret i32 %i15
+}
+
+; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_y':
+; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.y(), !range !2
+; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13
+; CHECK-NEXT: %i15 = sdiv i32 %i14, 32
+; CHECK-NEXT: ret i32 %i15
+define hidden i32 @calculate_wid_y() #0 {
+bb:
+  %i = tail call i32 @llvm.amdgcn.workgroup.id.y()
+  %i1 = tail call 
align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 6 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 2, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 16 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 8 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.y(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_lid_y': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.y(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: %i15 = srem i32 %i14, 32 +; CHECK-NEXT: ret i32 %i15 +define hidden i32 @calculate_lid_y() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.y() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 6 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 2, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 16 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 8 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.y(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = srem i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_z': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.z(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: ret i32 %i15 +define hidden i32 @calculate_wid_z() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.z() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 8 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 20 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.z(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_lid_z': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.z(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: %i15 = srem i32 %i14, 32 +; CHECK-NEXT: ret i32 %i15 +define hidden i32 @calculate_lid_z() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.z() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 8 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 
addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 20 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.z(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = srem i32 %i14, 32 + ret i32 %i15 +} + +; FIXME: Should be recognized as uniform +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_commute_mul': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_commute_mul() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i8, %i + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; TODO: Could handle this case +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_nonuniform_workgroup_size': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_nonuniform_workgroup_size() #3 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_wrong_offset': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_wrong_offset() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 
addrspace(4)* %i1, i64 6 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_udiv': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: %i15 = udiv i32 %i14, 32 +; CHECK-NEXT: ret i32 %i15 +define hidden i32 @calculate_wid_x_udiv() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = udiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_lid_x_urem': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: %i15 = urem i32 %i14, 32 +; CHECK-NEXT: ret i32 %i15 +define hidden i32 @calculate_lid_x_urem() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = urem i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_wrong_wave_size': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 64 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_wrong_wave_size() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 
addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 64 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_wrong_group_intrinsic': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_wrong_group_intrinsic() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.y() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_not_mul': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_not_mul() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = shl i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 +declare i32 @llvm.amdgcn.workgroup.id.x() #1 +declare i32 @llvm.amdgcn.workgroup.id.y() #1 +declare i32 @llvm.amdgcn.workgroup.id.z() #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare i32 @llvm.amdgcn.workitem.id.y() #1 +declare i32 @llvm.amdgcn.workitem.id.z() #1 +declare i32 @llvm.umin.i32(i32, i32) #2 + +attributes #0 = { mustprogress nofree noinline nosync nounwind readnone willreturn "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nocallback nofree nosync nounwind readnone speculatable willreturn } +attributes #3 = { 
mustprogress nofree noinline nosync nounwind readnone willreturn "uniform-work-group-size"="false" } + +!0 = !{i16 1, i16 1025} +!1 = !{} +!2 = !{i32 0, i32 1024} Index: llvm/test/CodeGen/AMDGPU/wave-id-computation.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/wave-id-computation.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +; Specially recognize workitem ID division by wavesize as uniform. Issue 54010. + +; With the sdiv recognized as uniform, the load from %gep.in should be turned into an s_load_dword. +define amdgpu_kernel void @calculate_wid_x(i32 addrspace(1)* noalias %out.ptr, i32 addrspace(1)* noalias %in.ptr) #0 { +; CHECK-LABEL: calculate_wid_x: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_dword s9, s[4:5], 0x4 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s4, s9, 0xffff +; CHECK-NEXT: s_mul_i32 s4, s4, s8 +; CHECK-NEXT: v_add_u32_e32 v0, s4, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 26, v1 +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 6, v0 +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: s_ashr_i32 s5, s4, 31 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; CHECK-NEXT: s_add_u32 s2, s2, s4 +; CHECK-NEXT: s_addc_u32 s3, s3, s5 +; CHECK-NEXT: s_load_dword s2, s[2:3], 0x0 +; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_addc_u32 s1, s1, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i8 = zext i16 %i4 to i32 + %i12 = mul i32 %i8, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 64 + %gep.in = getelementptr i32, i32 addrspace(1)* %in.ptr, i32 %i15 + %load = load i32, i32 addrspace(1)* %gep.in + %gep.out = getelementptr i32, i32 addrspace(1)* %out.ptr, i32 %i15 + store i32 %load, i32 addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @calculate_wid_x_vgpr_user(i32 addrspace(1)* noalias %out.ptr) #0 { +; CHECK-LABEL: calculate_wid_x_vgpr_user: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x4 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s2, s2, 0xffff +; CHECK-NEXT: s_mul_i32 s2, s2, s8 +; CHECK-NEXT: v_add_u32_e32 v0, s2, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 26, v1 +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 6, v0 +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_addc_u32 s1, s1, s3 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: global_store_dword v0, v0, s[0:1] +; CHECK-NEXT: s_endpgm +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail 
call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i8 = zext i16 %i4 to i32 + %i12 = mul i32 %i8, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 64 + %gep.out = getelementptr i32, i32 addrspace(1)* %out.ptr, i32 %i15 + store i32 0, i32 addrspace(1)* %gep.out + ret void +} + +define hidden i32 @calculate_wid_x_commute() #0 { +; CHECK-LABEL: calculate_wid_x_commute: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x4 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s4, s4, 0xffff +; CHECK-NEXT: s_mul_i32 s12, s12, s4 +; CHECK-NEXT: v_add_u32_e32 v0, s12, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 26, v1 +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 6, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i8 = zext i16 %i4 to i32 + %i12 = mul i32 %i, %i8 + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 64 + ret i32 %i15 +} + +@ptr_table = external hidden unnamed_addr addrspace(4) constant [0 x void(i32)*], align 8 + +; This does not require a waterfall loop if the sdiv is treated as uniform +define amdgpu_kernel void @calculate_wid_x_reqd_uniform_user(float addrspace(1)* noalias %out.ptr, i32 addrspace(1)* noalias %in.ptr) #0 { +; CHECK-LABEL: calculate_wid_x_reqd_uniform_user: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_load_dword s14, s[4:5], 0x4 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s14, s14, 0xffff +; CHECK-NEXT: s_mul_i32 s14, s14, s12 +; CHECK-NEXT: v_add_u32_e32 v3, s14, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; CHECK-NEXT: v_lshrrev_b32_e32 v4, 26, v4 +; CHECK-NEXT: v_add_u32_e32 v3, v3, v4 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 6, v3 +; CHECK-NEXT: v_readfirstlane_b32 s14, v3 +; CHECK-NEXT: s_ashr_i32 s15, s14, 31 +; CHECK-NEXT: s_lshl_b64 s[14:15], s[14:15], 3 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, ptr_table@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, ptr_table@rel32@hi+12 +; CHECK-NEXT: s_add_u32 s14, s14, s18 +; CHECK-NEXT: s_addc_u32 s15, s15, s19 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; CHECK-NEXT: s_add_u32 s8, s8, 16 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_mov_b32 s14, s16 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x7b +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_endpgm +bb: + 
%i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i8 = zext i16 %i4 to i32 + %i12 = mul i32 %i8, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 64 + %gep = getelementptr [0 x void(i32)*], [0 x void(i32)*] addrspace(4)* @ptr_table, i32 0, i32 %i15 + %fptr = load void(i32)*, void(i32)* addrspace(4)* %gep + call void %fptr(i32 123) + ret void +} + +define amdgpu_kernel void @compute_both(i32 addrspace(1)* noalias %out.ptr, i32 addrspace(1)* noalias %in.ptr0, i32 addrspace(1)* noalias %in.ptr1) #0 { +; CHECK-LABEL: compute_both: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s9, s[4:5], 0x4 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x10 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s4, s9, 0xffff +; CHECK-NEXT: s_mul_i32 s8, s8, s4 +; CHECK-NEXT: v_add_u32_e32 v0, s8, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 26, v1 +; CHECK-NEXT: v_add_u32_e32 v1, v0, v1 +; CHECK-NEXT: v_and_b32_e32 v2, 0xffffffc0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 6, v1 +; CHECK-NEXT: v_sub_u32_e32 v0, v0, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; CHECK-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v4, s1 +; CHECK-NEXT: v_add_co_u32_e32 v3, vcc, s0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; CHECK-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s3 +; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; CHECK-NEXT: global_load_dword v2, v[3:4], off +; CHECK-NEXT: global_load_dword v5, v[0:1], off +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_u32_e32 v1, v2, v5 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %gid.x = call i32 @llvm.amdgcn.workgroup.id.x() + %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4 + %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)* + %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4 + %group.size.x.ext = zext i16 %group.size.x to i32 + %mul = mul i32 %gid.x, %group.size.x.ext + %tid = add i32 %mul, %id.x + %wid = sdiv i32 %tid, 64 + %lid = srem i32 %tid, 64 + %gep0 = getelementptr i32, i32 addrspace(1)* %in.ptr0, i32 %wid + %gep1 = getelementptr i32, i32 addrspace(1)* %in.ptr1, i32 %lid + %load0 = load i32, i32 addrspace(1)* %gep0 + %load1 = load i32, i32 addrspace(1)* %gep1 + %add = add i32 %load0, %load1 + store i32 %add, i32 addrspace(1)* %out.ptr + ret void +} + +declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 +declare i32 @llvm.amdgcn.workgroup.id.x() #1 +declare i32 @llvm.amdgcn.workgroup.id.y() #1 +declare i32 @llvm.amdgcn.workgroup.id.z() #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare i32 @llvm.amdgcn.workitem.id.y() #1 +declare i32 @llvm.amdgcn.workitem.id.z() #1 +declare i32 
@llvm.umin.i32(i32, i32) #2 + +attributes #0 = { mustprogress nofree noinline nosync nounwind readnone willreturn "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nocallback nofree nosync nounwind readnone speculatable willreturn } + +!0 = !{i16 1, i16 1025} +!1 = !{} +!2 = !{i32 0, i32 1024}
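
For reference, the source-level pattern these tests exercise matches the comment in isAlwaysUniform() above: (blockIdx * blockDim + threadId) / wavesize. Below is a minimal HIP-style sketch of such code; it is illustrative only and not part of the patch, and the kernel name and the use of warpSize are assumptions.

// Illustrative HIP kernel (not part of this patch; kernel name and warpSize
// usage are assumptions). The flat workitem id is
// blockIdx.x * blockDim.x + threadIdx.x; dividing it by the wave size gives
// the wave id, which is the same for every lane of a wave, while the
// remainder (the lane id) remains divergent.
#include <hip/hip_runtime.h>

__global__ void wave_id_example(int *__restrict__ out,
                                const int *__restrict__ in) {
  int flat_id = blockIdx.x * blockDim.x + threadIdx.x;
  int wave_id = flat_id / warpSize; // wave-uniform; eligible for scalar loads
  int lane_id = flat_id % warpSize; // divergent per lane
  out[flat_id] = in[wave_id] + lane_id;
}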