diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -918,20 +918,36 @@
     return false;
   }
 
+  // In most cases TID / wavefrontsize is uniform.
+  //
+  // However, if a kernel has uneven dimensions we can have a value of
+  // workitem-id-x divided by the wavefrontsize be non-uniform. For example
+  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
+  // packed into the same wave, which gives 1 and 0 after the division by 64
+  // respectively.
+  //
+  // FIXME: limit it to 1D kernels only, although it should be possible
+  // to perform this optimization if the size of the X dimension is a power
+  // of 2; we just do not currently have the infrastructure to query it.
   using namespace llvm::PatternMatch;
   uint64_t C;
   if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_ConstantInt(C))) ||
       match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
-                      m_ConstantInt(C))))
-    return C >= ST->getWavefrontSizeLog2();
+                      m_ConstantInt(C)))) {
+    const Function *F = cast<Instruction>(V)->getFunction();
+    return C >= ST->getWavefrontSizeLog2() &&
+           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+  }
 
   Value *Mask;
   if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                        m_Value(Mask)))) {
-    const DataLayout &DL = cast<Instruction>(V)->getModule()->getDataLayout();
+    const Function *F = cast<Instruction>(V)->getFunction();
+    const DataLayout &DL = F->getParent()->getDataLayout();
     return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
-           ST->getWavefrontSizeLog2();
+               ST->getWavefrontSizeLog2() &&
+           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
   }
 
   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
--- a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -10,8 +10,8 @@
 
 ; OPT-LABEL: @lshr_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = lshr i32 %lid, 5
@@ -30,8 +30,8 @@
 
 ; OPT-LABEL: @ashr_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = ashr i32 %lid, 5
@@ -50,8 +50,96 @@
 
 ; OPT-LABEL: @and_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %and = and i32 %lid, -32
+  %div4 = zext i32 %and to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_no_dim_info:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_no_dim_info
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_no_dim_info(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_2d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_2d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_2d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !1 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_3d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_3d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_1d_uneven:
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @lshr_threadid_1d_uneven
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @lshr_threadid_1d_uneven(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !3 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}and_threadid_2d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @and_threadid_2d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @and_threadid_2d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !1 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %and = and i32 %lid, -32
@@ -64,3 +152,8 @@
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
+
+!0 = !{i32 64, i32 1, i32 1}
+!1 = !{i32 65, i32 2, i32 1}
+!2 = !{i32 64, i32 1, i32 2}
+!3 = !{i32 65, i32 1, i32 1}
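
Note on the (65, 2) example in the new comment above: the following is a minimal standalone C++ sketch, not part of the patch and not using any LLVM API, that assumes the usual flat workitem-to-wave packing and shows how a wave can straddle an X-row boundary and observe two different values of workitem-id-x >> 6:

// Illustrative only: simulates flat workitem packing into 64-wide waves for a
// workgroup of dimensions (DimX, DimY) and reports whether workitem-id-x >> 6
// is uniform within each wave.
#include <cstdio>

int main() {
  const unsigned DimX = 65, DimY = 2; // the uneven example from the comment
  const unsigned WaveSize = 64;       // wave64; wave32 behaves analogously

  for (unsigned WaveBegin = 0; WaveBegin < DimX * DimY; WaveBegin += WaveSize) {
    unsigned FirstVal = (WaveBegin % DimX) >> 6; // tid.x >> 6 for lane 0
    bool Uniform = true;
    for (unsigned Flat = WaveBegin;
         Flat < WaveBegin + WaveSize && Flat < DimX * DimY; ++Flat) {
      unsigned TidX = Flat % DimX; // x coordinate of this lane
      if ((TidX >> 6) != FirstVal)
        Uniform = false;
    }
    printf("wave starting at flat id %u: tid.x >> 6 is %s\n", WaveBegin,
           Uniform ? "uniform" : "NON-uniform");
  }
  return 0;
}

The second wave (flat ids 64..127) contains workitems (64, 0) and (0, 1), which yield 1 and 0 respectively, so the shifted value is not wave-uniform. The patch only requires the Y and Z dimensions to have a maximum workitem ID of 0, which is why @lshr_threadid_1d_uneven (metadata !3) is still annotated with !amdgpu.uniform while the 2-D and 3-D variants (!1, !2) and the kernel without dimension info are not.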