Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -328,12 +328,12 @@
     return HasMadMixInsts;
   }
 
-  bool hasSBufferLoadStoreAtomicDwordxN() const {
+  bool hasBuggySBufferLoadStoreAtomicxN() const {
     // Only use the "x1" variants on GFX9 or don't use the buffer variants.
     // For x2 and higher variants, if the accessed region spans 2 VM pages and
     // the second page is unmapped, the hw hangs.
     // TODO: There is one future GFX9 chip that doesn't have this bug.
-    return getGeneration() != GFX9;
+    return getGeneration() == GFX9;
   }
 
   bool hasCARRY() const {
Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -804,6 +804,8 @@
 // the same base register. We rely on the scheduler to do the hard work of
 // clustering nearby loads, and assume these are all adjacent.
 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
+  const SIMachineFunctionInfo *MFI =
+      MBB.getParent()->getInfo<SIMachineFunctionInfo>();
   bool Modified = false;
 
   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
@@ -849,7 +851,8 @@
       continue;
     }
 
-    if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
+    if ((!STM->hasBuggySBufferLoadStoreAtomicxN() ||
+         MFI->shrinkBuggySBufferLoadStoreAtomicxN()) &&
         (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
          Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
       // EltSize is in units of the offset encoding.
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -181,6 +181,14 @@
   // user arguments. This is an offset from the KernargSegmentPtr.
   bool ImplicitArgPtr : 1;
 
+  // This converts s_buffer_xxx to s_xxx to allow xN loads on chips where
+  // the buffer opcodes are buggy, but at the cost of removing the bounds
+  // checking that the buffer opcodes provide.
+  //
+  // Constraint: Only the BASE_ADDRESS_HI field of WORD1 can be set, so that
+  // WORD0:WORD1 can trivially be used as an address.
+  bool ShrinkBuggySBufferLoadStoreAtomicxN : 1;
+
   // The hard-wired high half of the address of the global information table
   // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
   // current hardware only allows a 16 bit value.
@@ -392,6 +400,10 @@
     return ImplicitBufferPtr;
   }
 
+  bool shrinkBuggySBufferLoadStoreAtomicxN() const {
+    return ShrinkBuggySBufferLoadStoreAtomicxN;
+  }
+
   AMDGPUFunctionArgInfo &getArgInfo() {
     return ArgInfo;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -47,6 +47,7 @@
     WorkItemIDZ(false),
     ImplicitBufferPtr(false),
     ImplicitArgPtr(false),
+    ShrinkBuggySBufferLoadStoreAtomicxN(false),
     GITPtrHigh(0xffffffff),
     HighBitsOf32BitAddress(0) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -161,6 +162,9 @@
       FlatScratchInit = true;
   }
 
+  if (F.hasFnAttribute("amdgpu-shrink-buggy-sbuffer-opcodes"))
+    ShrinkBuggySBufferLoadStoreAtomicxN = true;
+
   Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
   StringRef S = A.getValueAsString();
   if (!S.empty())
Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp
===================================================================
--- lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -293,6 +293,7 @@
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   std::vector<unsigned> I1Defs;
 
@@ -305,6 +306,53 @@
       Next = std::next(I);
       MachineInstr &MI = *I;
 
+      // Shrink buggy scalar buffer loads.
+      if (ST.hasBuggySBufferLoadStoreAtomicxN() &&
+          MFI->shrinkBuggySBufferLoadStoreAtomicxN() &&
+          TII->isSMRD(MI.getOpcode())) {
+        unsigned NewOpcode = 0;
+
+        // No other s_buffer opcodes can be generated by LLVM at the moment.
+        switch (MI.getOpcode()) {
+        case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX2_IMM;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX16_IMM;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX2_SGPR;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX4_SGPR;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX8_SGPR;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX16_SGPR;
+          break;
+        default:
+          continue;
+        }
+
+        // Extract the 64-bit base (WORD0:WORD1 of the descriptor) and use it
+        // directly as the address operand of the plain s_load variant.
+        unsigned SAddr = TII->buildExtractSubReg(MI, MRI, MI.getOperand(1),
+                                                 &AMDGPU::SReg_128RegClass,
+                                                 AMDGPU::sub0_sub1,
+                                                 &AMDGPU::SReg_64_XEXECRegClass);
+        MI.setDesc(TII->get(NewOpcode));
+        MI.getOperand(1).setReg(SAddr);
+        continue;
+      }
+
       if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
         // If this has a literal constant source that is the same as the
         // reversed bits of an inline immediate, replace with a bitreverse of
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -238,6 +238,27 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}smrd_imm_merged_shrunk:
+; GCN-NEXT: %bb.
+; SICI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1
+; SICI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x7
+; VI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4
+; VI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
+; GFX9-NEXT: s_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:1], 0x4
+; GFX9-NEXT: s_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:1], 0x1c
+define amdgpu_ps void @smrd_imm_merged_shrunk(<4 x i32> inreg %desc) #2 {
+main_body:
+  %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)
+  %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 8)
+  %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 12)
+  %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 16)
+  %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 28)
+  %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 32)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
+  ret void
+}
+
 ; GCN-LABEL: {{^}}smrd_vgpr_merged:
 ; GCN-NEXT: %bb.
 ; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
@@ -266,3 +287,4 @@
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "amdgpu-shrink-buggy-sbuffer-opcodes" }
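
Note for reviewers: the shrink is opt-in per function via the
"amdgpu-shrink-buggy-sbuffer-opcodes" attribute, and is only legal when the
frontend guarantees the descriptor constraint documented in
SIMachineFunctionInfo.h (only BASE_ADDRESS_HI set in WORD1, so that
WORD0:WORD1 form a plain 64-bit address). As a rough sketch of how a frontend
might opt in, using the generic LLVM C++ API (the helper name is illustrative
and not part of this patch):

  #include "llvm/IR/Function.h"

  // Opt a shader function into the GFX9 workaround: SIShrinkInstructions
  // will then rewrite s_buffer_load_dwordxN (N >= 2) into s_load_dwordxN,
  // trading the buffer opcodes' bounds checking for wide loads that don't
  // hang. The caller must guarantee the WORD0:WORD1 address constraint.
  static void enableSBufferShrink(llvm::Function &F) {
    F.addFnAttr("amdgpu-shrink-buggy-sbuffer-opcodes");
  }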