diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3220,7 +3220,8 @@
   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
-    return false;
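+    // A null register signals that no zero-extend pattern was matched.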
+    return Register();
 
   if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
     return Def->getOperand(1).getReg();
@@ -3822,27 +3822,42 @@
   getAddrModeInfo(*MI, *MRI, AddrInfo);
 
   // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
-  // then we can select all ptr + 32-bit offsets not just immediate offsets.
-  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+  // then we can select all ptr + 32-bit offsets.
+  if (AddrInfo.empty())
     return None;
 
   const GEPInfo &GEPInfo = AddrInfo[0];
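+  // The base pointer is the first SGPR part.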
+  Register PtrReg = GEPInfo.SgprParts[0];
+
   // SGPR offset is unsigned.
-  if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
-    return None;
+  if (GEPInfo.SgprParts.size() == 1 && isUInt<32>(GEPInfo.Imm) &&
+      GEPInfo.Imm != 0) {
+    // If we make it this far we have a load with a 32-bit immediate offset.
+    // It is OK to select this using an SGPR offset, because we have already
+    // failed trying to select this load into one of the _IMM variants since
+    // the _IMM patterns are considered before the _SGPR patterns.
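+    // Materialize the immediate in an SGPR, since the _SGPR variants take
+    // the offset in a register.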
+    Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
+        .addImm(GEPInfo.Imm);
+    return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+             [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}};
+  }
 
-  // If we make it this far we have a load with an 32-bit immediate offset.
-  // It is OK to select this using a sgpr offset, because we have already
-  // failed trying to select this load into one of the _IMM variants since
-  // the _IMM Patterns are considered before the _SGPR patterns.
-  Register PtrReg = GEPInfo.SgprParts[0];
-  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
-  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
-          .addImm(GEPInfo.Imm);
-  return {{
-    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
-    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
-  }};
+  if (GEPInfo.SgprParts.size() == 2 && GEPInfo.Imm == 0) {
+    if (Register OffsetReg =
+            matchZeroExtendFromS32(*MRI, GEPInfo.SgprParts[1])) {
+      return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+               [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}};
+    }
+  }
+
+  return None;
 }
 
 std::pair<Register, int>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -6,6 +6,7 @@
   define amdgpu_kernel void @smrd_imm(i32 addrspace(4)* %const0) { ret void }
   define amdgpu_kernel void @smrd_wide() { ret void }
   define amdgpu_kernel void @constant_address_positive() { ret void }
+  define amdgpu_kernel void @smrd_sgpr() { ret void }
 ...
 ---
 
@@ -210,3 +211,26 @@
     %3:sgpr(s32) = G_LOAD %2 :: (dereferenceable invariant load (s32), align 4, addrspace 4)
     S_ENDPGM 0, implicit %3
 ...
+
+---
+
+# Test a load with a register offset.
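+# The address is computed as ptr + zext(s32 offset), so the pre-extension
+# register should be selected directly as the SGPR offset operand.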
+# GCN-LABEL: name: smrd_sgpr{{$}}
+# GCN: S_LOAD_DWORD_SGPR %0, %1, 0
+
+name:            smrd_sgpr
+legalized:       true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2
+    %0:sgpr(p4) = COPY $sgpr0_sgpr1
+    %1:sgpr(s32) = COPY $sgpr2
+    %2:sgpr(s64) = G_ZEXT %1:sgpr(s32)
+    %3:sgpr(p4) = G_PTR_ADD %0, %2
+    %4:sgpr(s32) = G_LOAD %3 :: (dereferenceable invariant load (s32), align 4, addrspace 4)
+    S_ENDPGM 0, implicit %4
+...
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -2,10 +2,15 @@
 ; from a register.
 
 ; RUN: llc -march=amdgcn -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -global-isel -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GISEL %s
 
 ; GCN: %[[OFFSET:[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @DescriptorBuffer
 ; GCN: %{{[0-9]+}}:sgpr_128 = S_LOAD_DWORDX4_SGPR killed %{{[0-9]+}}, killed %[[OFFSET]], 0 :: (invariant load (s128) from %ir.13, addrspace 4)
 
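+; GlobalISel should likewise fold the zero-extended offset into the SGPR form of the load.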
+; GISEL: $[[OFFSET:.*]] = S_MOV_B32 target-flags(amdgpu-abs32-lo) @DescriptorBuffer
+; GISEL: S_LOAD_DWORDX4_SGPR killed renamable {{.*}}, killed renamable $[[OFFSET]], 0 :: (invariant load (<4 x s32>) from {{.*}}, addrspace 4)
+
 define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %resNode0, i32 inreg %resNode1, <3 x i32> inreg %2, i32 inreg %3, <3 x i32> %4) local_unnamed_addr #2 {
 .entry:
   %5 = call i64 @llvm.amdgcn.s.getpc() #3