Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2649,6 +2649,32 @@
   return DAG.getUNDEF(VT);
 }
 
+// Lower a workitem id intrinsic to a live-in VGPR. If the maximum flat
+// workgroup size is known, wrap the result in an AssertZext so later
+// combines know the id fits into 8 or 16 bits.
+static SDValue LowerLocalId(SDValue Op, enum SIRegisterInfo::PreloadedValue Opc,
+                            SelectionDAG &DAG) {
+  SDLoc SL(Op);
+  EVT VT = Op.getValueType();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SITargetLowering &TLI =
+      static_cast<const SITargetLowering &>(DAG.getTargetLoweringInfo());
+
+  SDValue Res = TLI.CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
+                                         TRI->getPreloadedValue(MF, Opc), VT);
+
+  // getFlatWorkGroupSizes() returns a (minimum, maximum) pair; only the
+  // maximum bounds the workitem id. Zero means the size is unknown.
+  unsigned Size = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
+  if (!Size || Size > 0xFFFFu)
+    return Res;
+
+  SDValue NarrowVT = DAG.getValueType((Size <= 256) ? MVT::i8 : MVT::i16);
+  return DAG.getNode(ISD::AssertZext, SL, VT, Res, NarrowVT);
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -2789,16 +2815,13 @@
       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
   case Intrinsic::amdgcn_workitem_id_x:
   case Intrinsic::r600_read_tidig_x:
-    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+    return LowerLocalId(Op, SIRegisterInfo::WORKITEM_ID_X, DAG);
   case Intrinsic::amdgcn_workitem_id_y:
   case Intrinsic::r600_read_tidig_y:
-    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+    return LowerLocalId(Op, SIRegisterInfo::WORKITEM_ID_Y, DAG);
   case Intrinsic::amdgcn_workitem_id_z:
   case Intrinsic::r600_read_tidig_z:
-    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
+    return LowerLocalId(Op, SIRegisterInfo::WORKITEM_ID_Z, DAG);
   case AMDGPUIntrinsic::SI_load_const: {
     SDValue Ops[] = {
       Op.getOperand(1),
Index: test/CodeGen/AMDGPU/add.i16.ll
===================================================================
--- test/CodeGen/AMDGPU/add.i16.ll
+++ test/CodeGen/AMDGPU/add.i16.ll
@@ -84,10 +84,10 @@
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/bfe-patterns.ll
===================================================================
--- test/CodeGen/AMDGPU/bfe-patterns.ll
+++ test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -50,7 +50,7 @@
 ; GCN-LABEL: {{^}}s_ubfe_sub_i32:
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
 ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
 define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -128,7 +128,7 @@
 ; GCN-LABEL: {{^}}s_sbfe_sub_i32:
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
 ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
 define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -150,7 +150,7 @@
 ; Do scalar loads into the super register we need.
 ; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
-; CI-NOT: v_mov
+; CI-NOT: v_mov_b32_e32 {{v[0-9]+}}, {{[sv][0-9]+}}
 ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
 ; CI: s_endpgm
 define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
@@ -173,7 +173,7 @@
 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; CI-NOT: v_mov
+; CI-NOT: v_mov_b32_e32 {{v[0-9]+}}, {{[sv][0-9]+}}
 ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
 ; CI: s_endpgm
 define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -234,8 +234,8 @@
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -248,8 +248,8 @@
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
 define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -355,8 +355,8 @@
 }
 
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@@ -370,8 +370,8 @@
 }
 
 ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
 define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -206,8 +206,8 @@
 }
 
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@@ -221,8 +221,8 @@
 }
 
 ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
 define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
@@ -348,8 +348,8 @@
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -362,8 +362,8 @@
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
 define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/local-memory.amdgcn.ll
===================================================================
--- test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -45,11 +45,7 @@
 
 ; GCN-LABEL: {{^}}local_memory_two_objects:
 ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-
-; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
-
-; SI-DAG: ds_write_b32 [[ADDRW]],
-; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
+; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 
 ; GCN: s_barrier
Index: test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
===================================================================
--- test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
+++ test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -14,8 +14,8 @@
 }
 
 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range:
-; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0
-; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
+; CHECK-NOT: v_and_b32
+; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
 define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
@@ -26,8 +26,8 @@
 
 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range_m1:
 ; CHECK-NOT: v0
-; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0
-; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
+; CHECK-NOT: v_and_b32
+; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
 define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1
Index: test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
===================================================================
--- test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -12,7 +12,7 @@
 ; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i128, i128 addrspace(1)* %in.gep
@@ -56,7 +56,7 @@
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i128, i128 addrspace(1)* %in.gep
@@ -113,5 +113,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
Index: test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
===================================================================
--- test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -9,7 +9,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -42,7 +42,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -58,7 +58,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -106,7 +106,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -122,7 +122,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -138,7 +138,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -156,7 +156,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -383,5 +383,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
Index: test/CodeGen/AMDGPU/shl.ll
===================================================================
--- test/CodeGen/AMDGPU/shl.ll
+++ test/CodeGen/AMDGPU/shl.ll
@@ -4,6 +4,8 @@
 
 declare i32 @llvm.r600.read.tidig.x() #0
 
+declare i32 @llvm.r600.read.tgid.x() #0
+
 ;EG: {{^}}shl_v2i32:
 ;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
@@ -288,7 +290,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
 define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %tid = call i32 @llvm.r600.read.tgid.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
   %a = load i64, i64 addrspace(1)* %gep.in
Index: test/CodeGen/AMDGPU/sub.i16.ll
===================================================================
--- test/CodeGen/AMDGPU/sub.i16.ll
+++ test/CodeGen/AMDGPU/sub.i16.ll
@@ -85,10 +85,10 @@
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/zext-lid.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/zext-lid.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+
+; CHECK-NOT: and_b32
+
+define amdgpu_kernel void @zext_grp_size_256(i32 addrspace(1)* nocapture %arg) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp1 = and i32 %tmp, 255
+  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp3 = and i32 %tmp2, 255
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+  %tmp6 = and i32 %tmp5, 255
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  ret void
+}
+
+define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp1 = and i32 %tmp, 65535
+  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp3 = and i32 %tmp2, 65535
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+  %tmp6 = and i32 %tmp5, 65535
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+declare i32 @llvm.amdgcn.workitem.id.y() #2
+
+declare i32 @llvm.amdgcn.workitem.id.z() #2
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
+attributes #2 = { nounwind readnone }
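
A minimal sketch of the pattern this lowering targets, for reference only (hypothetical kernel, not part of the patch; zext-lid.ll above is the authoritative test). With a maximum flat workgroup size of 256, the workitem id is lowered as (AssertZext id, i8), so known-bits analysis can drop the mask below and materialize the high half of the zext as a plain zero, matching the v_test_add_i16_zext_to_i64 checks above.

; Hypothetical example, not part of the patch.
define amdgpu_kernel void @zext_id_to_i64(i64 addrspace(1)* nocapture %out) #0 {
bb:
  ; A max flat workgroup size of 256 bounds the id to 8 bits, so this mask
  ; is redundant once the intrinsic result carries (AssertZext ..., i8).
  %id = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %masked = and i32 %id, 255
  %ext = zext i32 %masked to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
attributes #1 = { nounwind readnone }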