Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -335,6 +335,13 @@
   "Reserve VGPRs for trap handler usage"
 >;
 
+def FeatureDebuggerEmitPrologue : SubtargetFeature<
+  "amdgpu-debugger-emit-prologue",
+  "DebuggerEmitPrologue",
+  "true",
+  "Emit debugger prologue"
+>;
+
 //===----------------------------------------------------------------------===//
 
 def AMDGPUInstrInfo : InstrInfo {
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -42,6 +42,8 @@
         FlatUsed(false),
         ReservedVGPRFirst(0),
         ReservedVGPRCount(0),
+        DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
+        DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),
         VCCUsed(false),
         CodeLen(0) {}
 
@@ -75,6 +77,14 @@
     // The number of consecutive VGPRs reserved.
     uint16_t ReservedVGPRCount;
 
+    // Fixed SGPR number used to hold wave scratch offset for entire kernel
+    // execution, or uint16_t(-1) if the register is not used or not known.
+    uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
+
+    // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
+    // kernel execution, or uint16_t(-1) if the register is not used or not
+    // known.
+    uint16_t DebuggerPrivateSegmentBufferSGPR;
+
     // Bonus information for debugging.
     bool VCCUsed;
     uint64_t CodeLen;
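Note: both new fields default to uint16_t(-1). A minimal consumer-side sketch of that sentinel convention, assuming a tool that treats (uint16_t)-1 as "not used or not known" (the helper below is hypothetical, not part of this patch):

#include <cstdint>

// Hypothetical check mirroring the constructor defaults above:
// uint16_t(-1), i.e. 0xFFFF, marks a debugger SGPR field as absent.
static bool isDebuggerSGPRKnown(uint16_t RegNo) {
  return RegNo != static_cast<uint16_t>(-1);
}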
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -198,6 +198,11 @@
       OutStreamer->emitRawComment(" ReservedVGPRCount: " +
                                   Twine(KernelInfo.ReservedVGPRCount), false);
 
+      OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+        Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
+      OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
+        Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
+
       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                   Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
                                   false);
@@ -443,6 +448,16 @@
     MaxVGPR += MFI->getDebuggerReserveTrapVGPRCount();
   }
 
+  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
+  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
+  // attribute was specified.
+  if (STM.debuggerEmitPrologue()) {
+    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
+      RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
+    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
+      RI->getHWRegIndex(MFI->getScratchRSrcReg());
+  }
+
   // We found the maximum register index. They start at 0, so add one to get the
   // number of registers.
   ProgInfo.NumVGPR = MaxVGPR + 1;
@@ -662,6 +677,9 @@
   if (MFI->hasDispatchPtr())
     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 
+  if (STM.debuggerSupported())
+    header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
+
   if (STM.isXNACKEnabled())
     header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
 
@@ -672,6 +690,10 @@
   header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
   header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
   header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+  header.debug_wavefront_private_segment_offset_sgpr =
+    KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+  header.debug_private_segment_buffer_sgpr =
+    KernelInfo.DebuggerPrivateSegmentBufferSGPR;
 
   AMDGPUTargetStreamer *TS =
       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -97,6 +97,7 @@
   bool EnableSIScheduler;
   bool DebuggerInsertNops;
   bool DebuggerReserveTrapVGPRs;
+  bool DebuggerEmitPrologue;
 
   std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
   std::unique_ptr<AMDGPUTargetLowering> TLInfo;
@@ -315,6 +316,11 @@
     return EnableSIScheduler;
   }
 
+  bool debuggerSupported() const {
+    return debuggerInsertNops() && debuggerReserveTrapVGPRs() &&
+           debuggerEmitPrologue();
+  }
+
   bool debuggerInsertNops() const {
     return DebuggerInsertNops;
   }
@@ -323,6 +329,10 @@
     return DebuggerReserveTrapVGPRs;
   }
 
+  bool debuggerEmitPrologue() const {
+    return DebuggerEmitPrologue;
+  }
+
   bool dumpCode() const {
     return DumpCode;
   }
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -99,6 +99,7 @@
     IsaVersion(ISAVersion0_0_0), EnableSIScheduler(false),
     DebuggerInsertNops(false), DebuggerReserveTrapVGPRs(false),
+    DebuggerEmitPrologue(false),
     FrameLowering(nullptr), GISel(),
     InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
Index: lib/Target/AMDGPU/SIFrameLowering.h
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.h
+++ lib/Target/AMDGPU/SIFrameLowering.h
@@ -27,6 +27,20 @@
 
   void processFunctionBeforeFrameFinalized(
     MachineFunction &MF,
     RegScavenger *RS = nullptr) const override;
+
+private:
+  /// \brief Builds debugger store operation.
+  void buildDebuggerStoreOp(MachineFunction &MF,
+                            MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator I,
+                            unsigned DebuggerStoreOp,
+                            unsigned SrcReg,
+                            int ObjectIdx,
+                            unsigned ScratchRsrcReg,
+                            unsigned ScratchOffsetReg) const;
+
+  /// \brief Emits debugger prologue.
+  void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
 };
 
 }
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -37,6 +37,12 @@
 
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
+  // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
+  // specified.
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  if (ST.debuggerEmitPrologue())
+    emitDebuggerPrologue(MF, MBB);
+
   if (!MF.getFrameInfo()->hasStackObjects())
     return;
 
@@ -55,7 +61,6 @@
   const SIInstrInfo *TII =
       static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineBasicBlock::iterator I = MBB.begin();
 
@@ -86,6 +91,8 @@
   // pointer. Because we only detect if flat instructions are used at all,
   // this will be used more often than necessary on VI.
 
+  // Debug location must be unknown since the first debug location is used to
+  // determine the end of the prologue.
   DebugLoc DL;
 
   unsigned FlatScratchInitReg
@@ -282,3 +289,52 @@
     RS->addScavengingFrameIndex(ScavengeFI);
   }
 }
+
+void SIFrameLowering::buildDebuggerStoreOp(MachineFunction &MF,
+                                           MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator I,
+                                           unsigned DebuggerStoreOp,
+                                           unsigned SrcReg,
+                                           int ObjectIdx,
+                                           unsigned ScratchRsrcReg,
+                                           unsigned ScratchOffsetReg) const {
+  MF.getRegInfo().addLiveIn(SrcReg);
+  MBB.addLiveIn(SrcReg);
+
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
+
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, ObjectIdx);
+  unsigned Size = MF.getFrameInfo()->getObjectSize(ObjectIdx);
+  unsigned Align = MF.getFrameInfo()->getObjectAlignment(ObjectIdx);
+  MachineMemOperand *MemOperand =
+    MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, Size, Align);
+
+  BuildMI(MBB, I, DebugLoc(), TII->get(DebuggerStoreOp))
+    .addReg(SrcReg)
+    .addFrameIndex(ObjectIdx)
+    .addReg(ScratchRsrcReg)
+    .addReg(ScratchOffsetReg)
+    .addMemOperand(MemOperand);
+}
+
+void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
+                                           MachineBasicBlock &MBB) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MachineBasicBlock::iterator I = MBB.begin();
+
+  // For each dimension:
+  for (unsigned i = 0; i < 3; ++i) {
+    // Store work group ID.
+    buildDebuggerStoreOp(MF, MBB, I, AMDGPU::SI_DEBUGGER_STORE_WGID,
+                         MFI->getWorkGroupIDSGPR(i),
+                         MFI->getDebuggerWorkGroupIDStackObjectIndex(i),
+                         MFI->getScratchRSrcReg(),
+                         MFI->getScratchWaveOffsetReg());
+    // Store work item ID.
+    buildDebuggerStoreOp(MF, MBB, I, AMDGPU::SI_DEBUGGER_STORE_WIID,
+                         MFI->getWorkItemIDVGPR(i),
+                         MFI->getDebuggerWorkItemIDStackObjectIndex(i),
+                         MFI->getScratchRSrcReg(),
+                         MFI->getScratchWaveOffsetReg());
+  }
+}
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -671,6 +671,30 @@
     return SDValue();
   }
 
+  // Create stack objects that are used for emitting the debugger prologue if
+  // the "amdgpu-debugger-emit-prologue" attribute was specified.
+  //
+  // Debugger prologue writes work group IDs and work item IDs to scratch
+  // memory at a fixed location in the following format:
+  //   offset 0:  work group ID x
+  //   offset 4:  work group ID y
+  //   offset 8:  work group ID z
+  //   offset 16: work item ID x
+  //   offset 20: work item ID y
+  //   offset 24: work item ID z
+  if (ST.debuggerEmitPrologue()) {
+    int ObjectIdx = 0;
+    // For each dimension:
+    for (unsigned i = 0; i < 3; ++i) {
+      // Create fixed stack object for work group ID.
+      ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true);
+      Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
+      // Create fixed stack object for work item ID.
+      ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true);
+      Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
+    }
+  }
+
   SmallVector<ISD::InputArg, 16> Splits;
   BitVector Skipped(Ins.size());
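The layout comment above leaves offset 12 unused, so the work item IDs start at offset 16. A minimal sketch of the slot arithmetic implemented by the CreateFixedObject calls, assuming that layout (the helper is hypothetical, for illustration only):

// Work group IDs x/y/z live at offsets 0/4/8; work item IDs x/y/z at
// 16/20/24. Dim is 0 for x, 1 for y, 2 for z.
constexpr int debuggerSlotOffset(unsigned Dim, bool IsWorkItemID) {
  return (IsWorkItemID ? 16 : 0) + Dim * 4;
}

static_assert(debuggerSlotOffset(1, false) == 4, "work group ID y");
static_assert(debuggerSlotOffset(2, true) == 24, "work item ID z");

The same arithmetic shows up in the test below as the offset:4 through offset:24 immediates on the buffer_store_dword instructions.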
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1901,6 +1901,23 @@
 //===----------------------------------------------------------------------===//
 
 let isCodeGenOnly = 1, isPseudo = 1 in {
 
+let mayStore = 1, UseNamedOperandTable = 1, Uses = [EXEC] in {
+  // Used for storing work group ID.
+  def SI_DEBUGGER_STORE_WGID : InstSI <
+    (outs),
+    (ins SReg_32:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
+         SReg_32:$scratch_offset),
+    "", []
+  >;
+
+  // Used for storing work item ID.
+  def SI_DEBUGGER_STORE_WIID : InstSI <
+    (outs),
+    (ins VGPR_32:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
+         SReg_32:$scratch_offset),
+    "", []
+  >;
+} // End let mayStore = 1, UseNamedOperandTable = 1, Uses = [EXEC]
+
 // For use in patterns
 def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
   (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -64,6 +64,10 @@
   // Number of reserved VGPRs for trap handler usage.
   unsigned DebuggerReserveTrapVGPRCount;
 
+  // Stack object indices for work group IDs.
+  int DebuggerWorkGroupIDStackObjectIndices[3];
+  // Stack object indices for work item IDs.
+  int DebuggerWorkItemIDStackObjectIndices[3];
 
 public:
   // FIXME: Make private
@@ -333,6 +337,62 @@
     return DebuggerReserveTrapVGPRCount;
   }
 
+  /// \returns Stack object index for \p Dim's work group ID.
+  int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {
+    assert(Dim < 3);
+    return DebuggerWorkGroupIDStackObjectIndices[Dim];
+  }
+
+  /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
+  void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+    assert(Dim < 3);
+    DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
+  }
+
+  /// \returns Stack object index for \p Dim's work item ID.
+  int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const {
+    assert(Dim < 3);
+    return DebuggerWorkItemIDStackObjectIndices[Dim];
+  }
+
+  /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
+  void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+    assert(Dim < 3);
+    DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
+  }
+
+  /// \returns SGPR used for \p Dim's work group ID.
+  unsigned getWorkGroupIDSGPR(unsigned Dim) const {
+    switch (Dim) {
+    case 0:
+      assert(hasWorkGroupIDX());
+      return WorkGroupIDXSystemSGPR;
+    case 1:
+      assert(hasWorkGroupIDY());
+      return WorkGroupIDYSystemSGPR;
+    case 2:
+      assert(hasWorkGroupIDZ());
+      return WorkGroupIDZSystemSGPR;
+    }
+    llvm_unreachable("unexpected dimension");
+  }
+
+  /// \returns VGPR used for \p Dim's work item ID.
+  unsigned getWorkItemIDVGPR(unsigned Dim) const {
+    switch (Dim) {
+    case 0:
+      assert(hasWorkItemIDX());
+      return AMDGPU::VGPR0;
+    case 1:
+      assert(hasWorkItemIDY());
+      return AMDGPU::VGPR1;
+    case 2:
+      assert(hasWorkItemIDZ());
+      return AMDGPU::VGPR2;
+    }
+    llvm_unreachable("unexpected dimension");
+  }
+
   unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
 };
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -50,6 +50,8 @@
     ReturnsVoid(true),
     MaximumWorkGroupSize(0),
     DebuggerReserveTrapVGPRCount(0),
+    DebuggerWorkGroupIDStackObjectIndices{0, 0, 0},
+    DebuggerWorkItemIDStackObjectIndices{0, 0, 0},
     LDSWaveSpillSize(0),
     PSInputEna(0),
     NumUserSGPRs(0),
@@ -88,16 +90,16 @@
     WorkItemIDX = true;
   }
 
-  if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+  if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
     WorkGroupIDY = true;
 
-  if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+  if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
     WorkGroupIDZ = true;
 
-  if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+  if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
     WorkItemIDY = true;
 
-  if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+  if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
     WorkItemIDZ = true;
 
   // X, XY, and XYZ are the only supported combinations, so make sure Y is
@@ -106,7 +108,7 @@
     WorkItemIDY = true;
 
   bool MaySpill = ST.isVGPRSpillingEnabled(*F);
-  bool HasStackObjects = FrameInfo->hasStackObjects();
+  bool HasStackObjects = FrameInfo->hasStackObjects() || ST.debuggerEmitPrologue();
 
   if (HasStackObjects || MaySpill)
     PrivateSegmentWaveByteOffset = true;
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -396,6 +396,8 @@
   case AMDGPU::SI_SPILL_S32_RESTORE:
   case AMDGPU::SI_SPILL_V32_SAVE:
   case AMDGPU::SI_SPILL_V32_RESTORE:
+  case AMDGPU::SI_DEBUGGER_STORE_WGID:
+  case AMDGPU::SI_DEBUGGER_STORE_WIID:
     return 1;
   default: llvm_unreachable("Invalid spill opcode");
   }
@@ -496,6 +498,30 @@
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   switch (MI->getOpcode()) {
+    // Build scratch store for work group ID.
+    case AMDGPU::SI_DEBUGGER_STORE_WGID: {
+      unsigned SrcVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SrcVGPR)
+        .addReg(TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg());
+      buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+        SrcVGPR,
+        TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
+        TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
+        FrameInfo->getObjectOffset(Index), RS);
+      MI->eraseFromParent();
+      break;
+    }
+
+    // Build scratch store for work item ID.
+    case AMDGPU::SI_DEBUGGER_STORE_WIID: {
+      buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+        TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
+        TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
+        TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
+        FrameInfo->getObjectOffset(Index), RS);
+      MI->eraseFromParent();
+      break;
+    }
+
     // SGPR register spill
     case AMDGPU::SI_SPILL_S512_SAVE:
     case AMDGPU::SI_SPILL_S256_SAVE:
Index: test/CodeGen/AMDGPU/debugger-emit-prologue.ll
===================================================================
--- test/CodeGen/AMDGPU/debugger-emit-prologue.ll
+++ test/CodeGen/AMDGPU/debugger-emit-prologue.ll
@@ -0,0 +1,76 @@
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: debug_wavefront_private_segment_offset_sgpr = [[SOFF:[0-9]+]]
+; CHECK: debug_private_segment_buffer_sgpr = [[SREG:[0-9]+]]
+
+; CHECK: v_mov_b32_e32 [[WGIDX:v[0-9]+]], s{{[0-9]+}}
+; CHECK: buffer_store_dword [[WGIDX]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]]
+; CHECK: buffer_store_dword v0, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:16
+
+; CHECK: v_mov_b32_e32 [[WGIDY:v[0-9]+]], s{{[0-9]+}}
+; CHECK: buffer_store_dword [[WGIDY]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:4
+; CHECK: buffer_store_dword v1, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:20
+
+; CHECK: v_mov_b32_e32 [[WGIDZ:v[0-9]+]], s{{[0-9]+}}
+; CHECK: buffer_store_dword [[WGIDZ]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:8
+; CHECK: buffer_store_dword v2, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:24
+
+; CHECK: DebuggerWavefrontPrivateSegmentOffsetSGPR: s[[SOFF]]
+; CHECK: DebuggerPrivateSegmentBufferSGPR: s[[SREG]]
+
+; Function Attrs: nounwind
+define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+entry:
+  %A.addr = alloca i32 addrspace(1)*, align 4
+  store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
+  store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
+  %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
+  store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
+  %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !24
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i32 2, !dbg !24
+  store i32 3, i32 addrspace(1)* %arrayidx2, align 4, !dbg !25
+  ret void, !dbg !26
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!3}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 269772)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test01.cl", directory: "/home/kzhuravl/Lightning/testing")
"test01.cl", directory: "/home/kzhuravl/Lightning/testing") +!2 = !{} +!3 = !{void (i32 addrspace(1)*)* @test, !4, !5, !6, !7, !8} +!4 = !{!"kernel_arg_addr_space", i32 1} +!5 = !{!"kernel_arg_access_qual", !"none"} +!6 = !{!"kernel_arg_type", !"int*"} +!7 = !{!"kernel_arg_base_type", !"int*"} +!8 = !{!"kernel_arg_type_qual", !""} +!9 = !{i32 2, !"Dwarf Version", i32 2} +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{!"clang version 3.9.0 (trunk 269772)"} +!12 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !13, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!13 = !DISubroutineType(types: !14) +!14 = !{null, !15} +!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 32) +!16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!17 = !DILocalVariable(name: "A", arg: 1, scope: !12, file: !1, line: 1, type: !15) +!18 = !DIExpression() +!19 = !DILocation(line: 1, column: 30, scope: !12) +!20 = !DILocation(line: 2, column: 3, scope: !12) +!21 = !DILocation(line: 2, column: 8, scope: !12) +!22 = !DILocation(line: 3, column: 3, scope: !12) +!23 = !DILocation(line: 3, column: 8, scope: !12) +!24 = !DILocation(line: 4, column: 3, scope: !12) +!25 = !DILocation(line: 4, column: 8, scope: !12) +!26 = !DILocation(line: 5, column: 1, scope: !12)