diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -675,6 +675,7 @@
      Private                           5               private     scratch          32      0xFFFFFFFF
      Constant 32-bit                   6               *TODO*                               0x00000000
      Buffer Fat Pointer (experimental) 7               *TODO*
+     Streamout Registers               128             N/A         GS_REGS
      ================================= =============== =========== ================ ======= ============================
 
 **Generic**
@@ -783,6 +784,13 @@
   model the buffer descriptors used heavily in graphics workloads targeting
   the backend.
 
+**Streamout Registers**
+  Dedicated registers used by the GS NGG streamout instructions. The register
+  file is modeled as a memory in a distinct address space because it is
+  indexed by an address-like offset rather than by named registers, and
+  because register accesses affect LGKMcnt. This is an internal address space
+  used only by the compiler. Do not use this address space for IR pointers.
+
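+  For example (a sketch taken from the tests in this patch), the
+  ``llvm.amdgcn.ds.add.gs.reg.rtn`` and ``llvm.amdgcn.ds.sub.gs.reg.rtn``
+  intrinsics access this register file, with the register selected by an
+  immediate offset operand rather than a pointer:
+
+  .. code-block:: llvm
+
+     ; The second operand (16) is the immediate register offset and must be
+     ; a constant; %old receives the value returned by the instruction.
+     %old = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %val, i32 16)
+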
 .. _amdgpu-memory-scopes:
 
 Memory Scopes
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1996,12 +1996,14 @@
 def int_amdgcn_ds_add_gs_reg_rtn :
   ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
   Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
-            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
+            "", [SDNPMemOperand]>;
 
 def int_amdgcn_ds_sub_gs_reg_rtn :
   ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
   Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
-            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
+            "", [SDNPMemOperand]>;
 
 def int_amdgcn_ds_bvh_stack_rtn :
   Intrinsic<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -385,6 +385,10 @@
 
     BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers.
 
+    /// Internal address spaces. Can be freely renumbered.
+    STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers.
+    /// end Internal address spaces.
+
     /// Address space for direct addressable parameter memory (CONST0).
     PARAM_D_ADDRESS = 6,
     /// Address space for indirect addressable parameter memory (VTX1).
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1096,6 +1096,15 @@
 
     return true;
   }
+  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
+  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
+    Info.ptrVal = nullptr;
+    Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+    return true;
+  }
   case Intrinsic::amdgcn_ds_append:
   case Intrinsic::amdgcn_ds_consume: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
@@ -8,13 +8,7 @@
 define amdgpu_gs void @test_add_32(i32 %arg) {
 ; CHECK-LABEL: test_add_32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
 ; CHECK-NEXT:    ds_add_gs_reg_rtn v[0:1], v0 offset:16 gds
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT:    buffer_gl0_inv
-; CHECK-NEXT:    buffer_gl1_inv
 ; CHECK-NEXT:    s_endpgm
   %unused = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
   ret void
@@ -23,13 +17,8 @@
 define amdgpu_gs void @test_add_32_use(i32 %arg, ptr addrspace(1) %out) {
 ; CHECK-LABEL: test_add_32_use:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
 ; CHECK-NEXT:    ds_add_gs_reg_rtn v[3:4], v0 offset:16 gds
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT:    buffer_gl0_inv
-; CHECK-NEXT:    buffer_gl1_inv
 ; CHECK-NEXT:    global_store_b32 v[1:2], v3, off
 ; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
@@ -41,13 +30,7 @@
 define amdgpu_gs void @test_add_64(i32 %arg) {
 ; CHECK-LABEL: test_add_64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
 ; CHECK-NEXT:    ds_add_gs_reg_rtn v[0:1], v0 offset:32 gds
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT:    buffer_gl0_inv
-; CHECK-NEXT:    buffer_gl1_inv
 ; CHECK-NEXT:    s_endpgm
   %unused = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
   ret void
@@ -56,13 +39,8 @@
 define amdgpu_gs void @test_add_64_use(i32 %arg, ptr addrspace(1) %out) {
 ; CHECK-LABEL: test_add_64_use:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
 ; CHECK-NEXT:    ds_add_gs_reg_rtn v[3:4], v0 offset:32 gds
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT:    buffer_gl0_inv
-; CHECK-NEXT:    buffer_gl1_inv
 ; CHECK-NEXT:    global_store_b64 v[1:2], v[3:4], off
 ; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
@@ -8,13 +8,7 @@
 define amdgpu_gs void @test_sub_32(i32 %arg) {
 ; CHECK-LABEL: test_sub_32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
 ; CHECK-NEXT:    ds_sub_gs_reg_rtn v[0:1], v0 offset:16 gds
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT:    buffer_gl0_inv
-; CHECK-NEXT:    buffer_gl1_inv
 ; CHECK-NEXT:    s_endpgm
   %unused = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
   ret void
@@ -23,13 +17,8 @@
 define amdgpu_gs void @test_sub_32_use(i32 %arg, ptr addrspace(1) %out) {
 ; CHECK-LABEL: test_sub_32_use:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
 ; CHECK-NEXT:    ds_sub_gs_reg_rtn v[3:4], v0 offset:16 gds
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT:    buffer_gl0_inv
-; CHECK-NEXT:    buffer_gl1_inv
 ; CHECK-NEXT:    global_store_b32 v[1:2], v3, off
 ; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm
@@ -41,13 +30,7 @@
 define amdgpu_gs void @test_sub_64(i32 %arg) {
 ; CHECK-LABEL: test_sub_64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
 ; CHECK-NEXT:    ds_sub_gs_reg_rtn v[0:1], v0 offset:32 gds
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT:    buffer_gl0_inv
-; CHECK-NEXT:    buffer_gl1_inv
 ; CHECK-NEXT:    s_endpgm
   %unused = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
   ret void
@@ -56,13 +39,8 @@
 define amdgpu_gs void @test_sub_64_use(i32 %arg, ptr addrspace(1) %out) {
 ; CHECK-LABEL: test_sub_64_use:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
 ; CHECK-NEXT:    ds_sub_gs_reg_rtn v[3:4], v0 offset:32 gds
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT:    buffer_gl0_inv
-; CHECK-NEXT:    buffer_gl1_inv
 ; CHECK-NEXT:    global_store_b64 v[1:2], v[3:4], off
 ; CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; CHECK-NEXT:    s_endpgm