diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -4557,7 +4557,7 @@
 follows:
 
   - If it is known during instruction selection that there is stack usage,
-    SGPR0-3 is reserved for use as the scratch V#.  Stack usage is assumed if
+    SGPR36-39 is reserved for use as the scratch V#.  Stack usage is assumed if
     optimizations are disabled (``-O0``), if stack objects already exist (for
     locals, etc.), or if there are any function calls.
 
@@ -10514,7 +10514,7 @@
 
 On entry to a function:
 
-1.  SGPR0-3 contain a V# with the following properties (see
+1.  SGPR36-39 contain a V# with the following properties (see
     :ref:`amdgpu-amdhsa-kernel-prolog-private-segment-buffer`):
 
     * Base address pointing to the beginning of the wavefront scratch backing
@@ -10529,14 +10529,14 @@
 5.  MODE register: *TBD*
-6.  VGPR0-31 and SGPR4-29 are used to pass function input arguments as described
+6.  VGPR0-31 and SGPR0-31 are used to pass function input arguments as described
     below.
-7.  SGPR30-31 return address (RA). The code address that the function must
+7.  SGPR34-35 return address (RA). The code address that the function must
     return to when it completes. The value is undefined if the function is *no
     return*.
 8.  SGPR32 is used for the stack pointer (SP). It is an unswizzled scratch
     offset relative to the beginning of the wavefront scratch backing memory.
 
     The unswizzled SP can be used with buffer instructions as an unswizzled SGPR
-    offset with the scratch V# in SGPR0-3 to access the stack in a swizzled
+    offset with the scratch V# in SGPR36-39 to access the stack in a swizzled
     manner.
 
     The unswizzled SP value can be converted into the swizzled SP value by:
@@ -10775,7 +10775,7 @@
     How are overly aligned structures allocated on the stack?
 
 * SGPR arguments are assigned to consecutive SGPRs starting at SGPR0 up to
-  SGPR29.
+  SGPR31.
 
   If there are more arguments than will fit in these registers, the remaining
   arguments are allocated on the stack in order on naturally aligned
@@ -10797,7 +10797,7 @@
 1.  SGPR33 is used as a frame pointer (FP) if necessary. Like the SP it is an
     unswizzled scratch address. It is only needed if runtime sized ``alloca``
     are used, or for the reasons defined in ``SIFrameLowering``.
-2.  Runtime stack alignment is supported. SGPR34 is used as a base pointer (BP)
+2.  Runtime stack alignment is supported. SGPR40 is used as a base pointer (BP)
     to access the incoming stack arguments in the function. The BP is needed
     only when the function requires the runtime stack alignment.
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1011,8 +1011,8 @@
     // copy.
     auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
                                                MFI->getScratchRSrcReg());
-    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-    MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+    MIRBuilder.buildCopy(TRI->getScratchRSrcReg(), ScratchRSrcReg);
+    MIB.addReg(TRI->getScratchRSrcReg(), RegState::Implicit);
   }
 
   for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -17,14 +17,12 @@
 
 // Calling convention for SI
 def CC_SI_Gfx : CallingConv<[
-  // 0-3 are reserved for the stack buffer descriptor
-  // 30-31 are reserved for the return address
-  // 32 is reserved for the stack pointer
+  // SGPR32+ is reserved: SP 32, FP 33, RA 34-35, scratch V# 36-39, BP 40.
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
-    SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
     SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
-    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
   ]>>>,
 
   CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
@@ -41,15 +39,12 @@
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
-  // 0-3 are reserved for the stack buffer descriptor
-  // 32 is reserved for the stack pointer
+  // SGPR32+ is reserved: SP 32, FP 33, RA 34-35, scratch V# 36-39, BP 40.
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
-    SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
     SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
     SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
-    SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
-    SGPR40, SGPR41, SGPR42, SGPR43
   ]>>>,
 
   CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3047,7 +3047,8 @@
       // In the HSA case, this should be an identity copy.
       SDValue ScratchRSrcReg
         = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
-      RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+      const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+      RegsToPass.emplace_back(TRI->getScratchRSrcReg(), ScratchRSrcReg);
       CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
       Chain = DAG.getTokenFactor(DL, CopyFromChains);
     }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -73,7 +73,9 @@
     if (!ST.enableFlatScratch()) {
       // Non-entry functions have no special inputs for now, other registers
       // required for scratch access.
-      ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+      // Ask SIRegisterInfo for the ABI register rather than repeating the
+      // constant, so the caller and callee sides cannot drift apart.
+      ScratchRSrcReg = ST.getRegisterInfo()->getScratchRSrcReg();
 
       ArgInfo.PrivateSegmentBuffer =
         ArgDescriptor::createRegister(ScratchRSrcReg);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -259,6 +259,10 @@
 
   MCRegister getReturnAddressReg(const MachineFunction &MF) const;
 
+  /// \returns the fixed SGPR tuple that carries the scratch buffer resource
+  /// descriptor (V#) across calls.
+  Register getScratchRSrcReg() const;
+
   const TargetRegisterClass *
   getRegClassForSizeOnBank(unsigned Size,
                            const RegisterBank &Bank,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -406,7 +406,9 @@
   return MFI.getNumFixedObjects() && shouldRealignStack(MF);
 }
 
-Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
+// Base pointer (BP): used to reach incoming stack arguments when the function
+// requires runtime stack realignment.
+Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR40; }
 
 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
   return CSR_AMDGPU_AllVGPRs_RegMask;
@@ -2319,7 +2321,13 @@
 
 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
   // Not a callee saved register.
-  return AMDGPU::SGPR30_SGPR31;
+  return AMDGPU::SGPR34_SGPR35;
+}
+
+// Fixed SGPR tuple for the scratch buffer resource descriptor (V#); it sits
+// above SGPR0-31, which the calling convention assigns to arguments.
+Register SIRegisterInfo::getScratchRSrcReg() const {
+  return AMDGPU::SGPR36_SGPR37_SGPR38_SGPR39;
 }
 
 const TargetRegisterClass *