Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -155,8 +155,8 @@
   bool SelectFlatOffset(SDValue Addr, SDValue &VAddr,
                         SDValue &Offset, SDValue &SLC) const;
 
-  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
-                        bool &Imm) const;
+  bool SelectSMRDOffset(SDValue ByteOffsetNode, bool Addr32Bit,
+                        SDValue &Offset, bool &Imm) const;
   SDValue Expand32BitAddress(SDValue Addr) const;
   bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
                   bool &Imm) const;
@@ -1346,7 +1346,8 @@
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
-                                          SDValue &Offset, bool &Imm) const {
+                                          bool Addr32Bit, SDValue &Offset,
+                                          bool &Imm) const {
 
   // FIXME: Handle non-constant offsets.
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
@@ -1358,6 +1359,14 @@
   int64_t ByteOffset = C->getSExtValue();
   int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);
 
+  // A 32-bit (address + offset) can wrap around, while the same expression
+  // might not wrap around in 64 bits. Tests doing (Addr - BigConst + BigConst)
+  // need this handled, so assume that an offset of at most 256 KiB never
+  // causes a wraparound. The limit is a guess; negative offsets are rejected
+  // too. Address wraparounds are never expected to occur in normal apps.
+  if (Addr32Bit && (uint64_t)ByteOffset > 256 * 1024)
+    return false;
+
   if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) {
     Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
     Imm = true;
@@ -1412,7 +1421,7 @@
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
 
-    if (SelectSMRDOffset(N1, Offset, Imm)) {
+    if (SelectSMRDOffset(N1, Addr.getValueType() == MVT::i32, Offset, Imm)) {
       SBase = Expand32BitAddress(N0);
       return true;
     }
@@ -1452,7 +1461,7 @@
 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
                                              SDValue &Offset) const {
   bool Imm;
-  return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
+  return SelectSMRDOffset(Addr, false, Offset, Imm) && Imm;
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
@@ -1461,7 +1470,7 @@
     return false;
 
   bool Imm;
-  if (!SelectSMRDOffset(Addr, Offset, Imm))
+  if (!SelectSMRDOffset(Addr, false, Offset, Imm))
     return false;
 
   return !Imm && isa<ConstantSDNode>(Offset);
Index: test/CodeGen/AMDGPU/constant-address-space-32bit.ll
===================================================================
--- test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+++ test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI,SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI,CI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s
 
@@ -268,6 +268,30 @@
   ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43
 }
 
+; GCN-LABEL: {{^}}load_addr_fold:
+; GCN-DAG: s_mov_b32 s1, 0
+; SI-DAG: s_mov_b32 s2, 0x40000
+; SI: s_load_dword s{{[0-9]}}, s[0:1], s2
+; CI: s_load_dword s{{[0-9]}}, s[0:1], 0x10000
+; VIGFX9: s_load_dword s{{[0-9]}}, s[0:1], 0x40000
+define amdgpu_vs float @load_addr_fold(i32 addrspace(6)* inreg %p0) #0 { ; constant offset is expected to be folded into s_load_dword
+  %gep1 = getelementptr i32, i32 addrspace(6)* %p0, i64 65536 ; 65536 i32 elements * 4 bytes = 0x40000 bytes (256 KiB)
+  %r1 = load i32, i32 addrspace(6)* %gep1
+  %r2 = bitcast i32 %r1 to float ; reinterpret the loaded bits so they match the float return type
+  ret float %r2
+}
+
+; GCN-LABEL: {{^}}load_addr_no_fold:
+; GCN-DAG: s_add_i32 s0, s0, 0x40004
+; GCN-DAG: s_mov_b32 s1, 0
+; GCN: s_load_dword s{{[0-9]}}, s[0:1], 0x0
+define amdgpu_vs float @load_addr_no_fold(i32 addrspace(6)* inreg %p0) #0 { ; constant offset is expected to stay in a separate s_add_i32
+  %gep1 = getelementptr i32, i32 addrspace(6)* %p0, i64 65537 ; 65537 i32 elements * 4 bytes = 0x40004 bytes, one element past 256 KiB
+  %r1 = load i32, i32 addrspace(6)* %gep1
+  %r2 = bitcast i32 %r1 to float ; reinterpret the loaded bits so they match the float return type
+  ret float %r2
+}
+
 ; Function Attrs: nounwind readnone speculatable
 declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6