Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -150,6 +150,8 @@
     return "amdgpu-dispatch-ptr";
   case Intrinsic::amdgcn_dispatch_id:
     return "amdgpu-dispatch-id";
+  case Intrinsic::amdgcn_kernarg_segment_ptr:
+    return "amdgpu-kernarg-segment-ptr";
   case Intrinsic::amdgcn_queue_ptr:
   case Intrinsic::trap:
   case Intrinsic::debugtrap:
@@ -181,7 +183,8 @@
     { "amdgpu-work-group-id-y" },
     { "amdgpu-work-group-id-z" },
     { "amdgpu-dispatch-ptr" },
-    { "amdgpu-dispatch-id" }
+    { "amdgpu-dispatch-id" },
+    { "amdgpu-kernarg-segment-ptr" }
   };
 
   if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -92,7 +92,7 @@
 
   CallingConv::ID CC = F->getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
-    KernargSegmentPtr = true;
+    KernargSegmentPtr = !F->arg_empty();
     WorkGroupIDX = true;
     WorkItemIDX = true;
   } else if (CC == CallingConv::AMDGPU_PS) {
@@ -154,6 +154,9 @@
     PrivateMemoryInputPtr = true;
   }
 
+  if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr"))
+    KernargSegmentPtr = true;
+
   // We don't need to worry about accessing spills with flat instructions.
   // TODO: On VI where we must use flat for global, we should be able to omit
   // this if it is never used for generic access.
Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
===================================================================
--- test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -8,6 +8,7 @@
 
 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
 declare i64 @llvm.amdgcn.dispatch.id() #0
 
 ; HSA: define void @use_workitem_id_y() #1 {
@@ -182,6 +183,19 @@
   ret void
 }
 
+; HSA: define void @use_kernarg_segment_ptr() #12 {
+define void @use_kernarg_segment_ptr() #1 {
+  %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+  store volatile i8 addrspace(2)* %kernarg.segment.ptr, i8 addrspace(2)* addrspace(1)* undef
+  ret void
+}
+
+; HSA: define void @func_indirect_use_kernarg_segment_ptr() #12 {
+define void @func_indirect_use_kernarg_segment_ptr() #1 {
+  call void @use_kernarg_segment_ptr()
+  ret void
+}
+
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind "target-cpu"="fiji" }
 attributes #2 = { nounwind "target-cpu"="gfx900" }
@@ -198,3 +212,4 @@
 ; HSA: attributes #9 = { nounwind "target-cpu"="fiji" }
 ; HSA: attributes #10 = { nounwind "target-cpu"="gfx900" }
 ; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" }
+; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" }
Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
===================================================================
--- test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -10,6 +10,7 @@
 
 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
 
 ; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
 define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
@@ -164,6 +165,15 @@
   ret void
 }
 
+; HSA: define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #12 {
+define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #1 {
+  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+  %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
+  %val = load i32, i32 addrspace(2)* %bc
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
 ; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
 define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
@@ -236,3 +246,4 @@
 ; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
 ; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
 ; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
+; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" }
Index: test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
===================================================================
--- test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -36,7 +36,7 @@
 ; CHECK-LABEL: {{^}}min_1024_max_2048
 ; CHECK: SGPRBlocks: 1
 ; CHECK: VGPRBlocks: 7
-; CHECK: NumSGPRsForWavesPerEU: 13
+; CHECK: NumSGPRsForWavesPerEU: 12
 ; CHECK: NumVGPRsForWavesPerEU: 32
 @var = addrspace(1) global float 0.0
 define amdgpu_kernel void @min_1024_max_2048() #3 {
Index: test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
===================================================================
--- test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -118,7 +118,7 @@
 ; CHECK-LABEL: {{^}}exactly_10:
 ; CHECK: SGPRBlocks: 1
 ; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 13
+; CHECK: NumSGPRsForWavesPerEU: 12
 ; CHECK: NumVGPRsForWavesPerEU: 24
 define amdgpu_kernel void @exactly_10() #9 {
   %val0 = load volatile float, float addrspace(1)* @var
Index: test/CodeGen/AMDGPU/hsa.ll
===================================================================
--- test/CodeGen/AMDGPU/hsa.ll
+++ test/CodeGen/AMDGPU/hsa.ll
@@ -40,7 +40,7 @@
 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
 ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
 
-; HSA: .amdgpu_hsa_kernel simple
+; HSA-LABEL: .amdgpu_hsa_kernel simple
 ; HSA: {{^}}simple:
 ; HSA: .amd_kernel_code_t
 ; HSA: enable_sgpr_private_segment_buffer = 1
@@ -65,3 +65,11 @@
   store i32 0, i32 addrspace(1)* %out
   ret void
 }
+
+; HSA-LABEL: .amdgpu_hsa_kernel simple_no_kernargs
+; HSA: enable_sgpr_kernarg_segment_ptr = 0
+define amdgpu_kernel void @simple_no_kernargs() {
+entry:
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
Index: test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -49,6 +49,18 @@
   ret void
 }
 
+; ALL-LABEL: {{^}}test_no_kernargs:
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5]
+define amdgpu_kernel void @test_no_kernargs() #1 {
+  %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+  %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
+  %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
+  %value = load i32, i32 addrspace(2)* %gep
+  store volatile i32 %value, i32 addrspace(1)* undef
+  ret void
+}
+
 declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
 declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0
 
Index: test/CodeGen/AMDGPU/mubuf-offset-private.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf-offset-private.ll
+++ test/CodeGen/AMDGPU/mubuf-offset-private.ll
@@ -5,42 +5,42 @@
 ; Test addressing modes when the scratch base is not a frame index.
 
 ; GCN-LABEL: {{^}}store_private_offset_i8:
-; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @store_private_offset_i8() #0 {
   store volatile i8 5, i8* inttoptr (i32 8 to i8*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i16:
-; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @store_private_offset_i16() #0 {
   store volatile i16 5, i16* inttoptr (i32 8 to i16*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i32:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @store_private_offset_i32() #0 {
   store volatile i32 5, i32* inttoptr (i32 8 to i32*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_v2i32:
-; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @store_private_offset_v2i32() #0 {
   store volatile <2 x i32> , <2 x i32>* inttoptr (i32 8 to <2 x i32>*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_v4i32:
-; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @store_private_offset_v4i32() #0 {
   store volatile <4 x i32> , <4 x i32>* inttoptr (i32 8 to <4 x i32>*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i8:
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @load_private_offset_i8() #0 {
   %load = load volatile i8, i8* inttoptr (i32 8 to i8*)
   ret void
@@ -65,7 +65,7 @@
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i16:
-; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @load_private_offset_i16() #0 {
   %load = load volatile i16, i16* inttoptr (i32 8 to i16*)
   ret void
@@ -90,28 +90,28 @@
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i32:
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @load_private_offset_i32() #0 {
   %load = load volatile i32, i32* inttoptr (i32 8 to i32*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_v2i32:
-; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @load_private_offset_v2i32() #0 {
   %load = load volatile <2 x i32>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_v4i32:
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @load_private_offset_v4i32() #0 {
   %load = load volatile <4 x i32>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset:
-; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:4095
+; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:4095
 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 {
   store volatile i8 5, i8* inttoptr (i32 4095 to i8*)
   ret void
@@ -119,7 +119,7 @@
 
 ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1:
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000
-; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen{{$}}
+; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen{{$}}
 define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 {
   store volatile i8 5, i8* inttoptr (i32 4096 to i8*)
   ret void
@@ -127,7 +127,7 @@
 
 ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2:
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000
-; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen offset:1{{$}}
+; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen offset:1{{$}}
 define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 {
   store volatile i8 5, i8* inttoptr (i32 4097 to i8*)
   ret void
Index: test/CodeGen/AMDGPU/private-access-no-objects.ll
===================================================================
--- test/CodeGen/AMDGPU/private-access-no-objects.ll
+++ test/CodeGen/AMDGPU/private-access-no-objects.ll
@@ -10,14 +10,14 @@
 
 ; GCN-LABEL: {{^}}store_to_undef:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
 ; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 
 ; -O0 should assume spilling, so the input scratch resource descriptor
 ; should be used directly without any copies.
 ; OPTNONE-NOT: s_mov_b32
-; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s5 offen{{$}}
 define amdgpu_kernel void @store_to_undef() #0 {
   store volatile i32 0, i32* undef
   ret void
@@ -26,7 +26,7 @@
 ; GCN-LABEL: {{^}}store_to_inttoptr:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
 ; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}}
 define amdgpu_kernel void @store_to_inttoptr() #0 {
   store volatile i32 0, i32* inttoptr (i32 124 to i32*)
@@ -36,7 +36,7 @@
 ; GCN-LABEL: {{^}}load_from_undef:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
 ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define amdgpu_kernel void @load_from_undef() #0 {
   %ld = load volatile i32, i32* undef
@@ -46,7 +46,7 @@
 ; GCN-LABEL: {{^}}load_from_inttoptr:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
 ; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}}
 define amdgpu_kernel void @load_from_inttoptr() #0 {
   %ld = load volatile i32, i32* inttoptr (i32 124 to i32*)
Index: test/CodeGen/AMDGPU/trap.ll
===================================================================
--- test/CodeGen/AMDGPU/trap.ll
+++ test/CodeGen/AMDGPU/trap.ll
@@ -19,11 +19,11 @@
 
 ; MESA-TRAP: .section .AMDGPU.config
 ; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long 208
+; MESA-TRAP-NEXT: .long 204
 
 ; NOMESA-TRAP: .section .AMDGPU.config
 ; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long 144
+; NOMESA-TRAP-NEXT: .long 140
 
 ; GCN-LABEL: {{^}}hsa_trap:
 ; HSA-TRAP: enable_trap_handler = 1
@@ -45,11 +45,11 @@
 
 ; MESA-TRAP: .section .AMDGPU.config
 ; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long 208
+; MESA-TRAP-NEXT: .long 204
 
 ; NOMESA-TRAP: .section .AMDGPU.config
 ; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long 144
+; NOMESA-TRAP-NEXT: .long 140
 
 ; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (): debugtrap handler not supported
 ; GCN-LABEL: {{^}}hsa_debugtrap: