Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -87,10 +87,11 @@
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   const GCNSubtarget *ST;
+  const SIMachineFunctionInfo *MFI;
 
   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
-                   unsigned UseOpIdx,
+                   int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
@@ -159,6 +160,17 @@
   }
 }
 
+// TODO: Add heuristic that the frame index might not fit in the addressing mode
+// immediate offset to avoid materializing in loops.
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+                              const MachineInstr &UseMI,
+                              int OpNo,
+                              const MachineOperand &OpToFold) {
+  return OpToFold.isFI() &&
+    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+    OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
 FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
@@ -290,7 +302,6 @@
                              MachineOperand *OpToFold,
                              const SIInstrInfo *TII) {
   if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     unsigned Opc = MI->getOpcode();
     if ((Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64 ||
@@ -403,7 +414,7 @@
 void SIFoldOperands::foldOperand(
   MachineOperand &OpToFold,
   MachineInstr *UseMI,
-  unsigned UseOpIdx,
+  int UseOpIdx,
   SmallVectorImpl<FoldCandidate> &FoldList,
   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
@@ -453,10 +464,28 @@
     return;
   }
 
+  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+    // Sanity check that this is a stack access.
+    // FIXME: Should probably use stack pseudos before frame lowering.
+    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
+      return;
+
+    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+        MFI->getScratchRSrcReg())
+      return;
 
-  bool FoldingImm = OpToFold.isImm();
+    // A frame index will resolve to a positive constant, so it should always be
+    // safe to fold the addressing mode, even pre-GFX9.
+    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+    SOff->setReg(MFI->getStackPtrOffsetReg());
+    return;
+  }
 
-  if (FoldingImm && UseMI->isCopy()) {
+  bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
+
+  if (FoldingImmLike && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
     const TargetRegisterClass *DestRC
       = TargetRegisterInfo::isVirtualRegister(DestReg) ?
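A note on the TODO above frameIndexMayFold: MUBUF instructions encode a 12-bit unsigned immediate offset, so one plausible shape for that heuristic is a range check on the object's frame offset before committing to the fold. The sketch below is illustrative only and not part of this patch; offsetLikelyFitsMUBUF is a hypothetical helper, and before frame lowering MachineFrameInfo::getObjectOffset still returns a local offset that may change once the final frame is laid out.

  // Hypothetical helper (not in this patch): skip the fold when the local
  // offset already exceeds MUBUF's 12-bit unsigned immediate offset field,
  // since the address would have to be materialized anyway.
  static bool offsetLikelyFitsMUBUF(const MachineFunction &MF, int FI) {
    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
    int64_t Offset = FrameInfo.getObjectOffset(FI);
    // Pre-lowering offsets are only estimates, so treat this as a heuristic.
    return Offset >= 0 && isUInt<12>(Offset);
  }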
@@ -517,7 +546,7 @@
   //   %sgpr = V_READFIRSTLANE_B32 %vgpr
   // =>
   //   %sgpr = S_MOV_B32 imm
-  if (FoldingImm) {
+  if (FoldingImmLike) {
     if (execMayBeModifiedBeforeUse(*MRI,
                                    UseMI->getOperand(UseOpIdx).getReg(),
                                    *OpToFold.getParent(),
@@ -528,7 +557,10 @@
 
     // FIXME: ChangeToImmediate should clear subreg
     UseMI->getOperand(1).setSubReg(0);
-    UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+    if (OpToFold.isImm())
+      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+    else
+      UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
     UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
     return;
   }
@@ -560,7 +592,7 @@
     return;
   }
 
-  if (!FoldingImm) {
+  if (!FoldingImmLike) {
     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
 
     // FIXME: We could try to change the instruction from 64-bit to 32-bit
@@ -904,6 +936,9 @@
       // in some cases. A better heuristic is needed.
       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+        foldOperand(OpToFold, UseMI, OpNo, FoldList,
+                    CopiesToReplace);
       } else {
         if (++NumLiteralUses == 1) {
           NonInlineUse = &*Use;
@@ -1170,8 +1205,7 @@
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
-
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
   // correctly handle signed zeros.
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -496,6 +496,11 @@
     return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
   }
 
+  // FIXME: Make this more precise
+  static bool isFLATScratch(const MachineInstr &MI) {
+    return isSegmentSpecificFLAT(MI);
+  }
+
   // Any FLAT encoded instruction, including global_* and scratch_*.
   bool isFLAT(uint16_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FLAT;
Index: test/CodeGen/AMDGPU/byval-frame-setup.ll
===================================================================
--- test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -27,6 +27,47 @@
   ret void
 }
 
+; Make sure the offset is folded and the function's frame register is used
+; rather than the global scratch wave offset.
+; GCN-LABEL: {{^}}void_func_byval_struct_use_outside_entry_block:
+; GCN-NOT: v_lshrrev_b32
+; GCN-NOT: s_sub_u32
+
+; GCN: s_and_saveexec_b64
+; GCN: s_cbranch_execz [[BB1:BB[0-9]+_[0-9]+]]
+
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
+
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
+; GCN-NOT: s32
+
+; GCN: [[BB1]]
+; GCN: s_or_b64 exec, exec
+define hidden void @void_func_byval_struct_use_outside_entry_block(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1, i1 %cond) #1 {
+entry:
+  br i1 %cond, label %bb0, label %bb1
+
+bb0:
+  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+  %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
+  %add = add nsw i32 %tmp, 1
+  store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+  %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4
+  %add3 = add nsw i32 %tmp1, 2
+  store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4
+  store volatile i32 9, i32 addrspace(1)* null, align 4
+  br label %bb1
+
+bb1:
+  ret void
+}
+
 ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
 ; GCN: s_mov_b32 s5, s32
 ; GCN: s_add_u32 s32, s32, 0xc00{{$}}
Index: test/CodeGen/AMDGPU/fold-fi-mubuf.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fold-fi-mubuf.mir
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: no_fold_fi_non_stack_rsrc_soffset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+  scratchWaveOffsetReg: '$sgpr6'
+  frameOffsetReg: '$sgpr6'
+  stackPtrOffsetReg: '$sgpr6'
+body: |
+  bb.0:
+    liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+    ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc_soffset
+    ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+    ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+    %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    %1:sreg_32_xm0 = S_MOV_B32 0
+    %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %3
+    SI_RETURN_TO_EPILOG $vgpr0
+
+...
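+
+# The rsrc is not the function's scratch resource descriptor, so the frame
+# index must not be folded even though the soffset is the stack pointer.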
+---
+name: no_fold_fi_non_stack_rsrc
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+  scratchWaveOffsetReg: '$sgpr6'
+  frameOffsetReg: '$sgpr6'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+    liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+    ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc
+    ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+    ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+    %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+    %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %3
+    SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+# Offset is from global scratch wave offset.
+---
+name: fold_fi_mubuf_scratch_scratch_wave_offset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  scratchWaveOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %2
+    S_ENDPGM 0, implicit $vgpr0
+
+...
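+
+# The soffset is already the stack pointer register, so the frame index is
+# folded and the soffset is left pointing at the SP.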
+---
+name: fold_fi_mubuf_scratch_sp_offset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  scratchWaveOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_fi_mubuf_scratch_sp_offset
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %2
+    S_ENDPGM 0, implicit $vgpr0
+
+...
Index: test/CodeGen/AMDGPU/frame-index-elimination.ll
===================================================================
--- test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -144,9 +144,6 @@
   ret void
 }
 
-; FIXME: Should be able to see that this can use vaddr, but the
-; FrameIndex is hidden behind a CopyFromReg in the second block.
-
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
 
 ; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
@@ -156,13 +153,13 @@
 
 ; GCN: s_and_saveexec_b64
 
-; CI: v_add_i32_e32 v0, vcc, 4, [[SHIFT]]
-; CI: buffer_load_dword v1, v1, s[0:3], s33 offen offset:4{{$}}
+; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
+; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
 
-; GFX9: v_add_u32_e32 v0, 4, [[SHIFT]]
-; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s33 offen offset:4{{$}}
+; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SHIFT]]
+; GFX9: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
 
-; GCN: ds_write_b32
+; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
 define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 {
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %bb, label %ret
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -440,6 +440,32 @@
   ret float %val
 }
 
+; Make sure frame index folding doesn't crash on a MUBUF not used
+; for stack access.
+
+; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
+; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
+define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
+  %alloca = alloca i32, addrspace(5)
+  %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+
+  %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false)
+  ret float %ret.val
+}
+
+; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset:
+; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
+; CHECK: buffer_load_dword v0, v{{\[}}[[FI]]:[[HI]]
+define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
+  %alloca = alloca i32, addrspace(5)
+  %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+
+  %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false)
+  ret float %ret.val
+}
+
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
Index: test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -60,8 +60,7 @@
 
 ; Make sure this doesn't crash.
 ; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: v_mov_b32_e32 [[FIVAL:v[0-9]]], 4
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[FIVAL]]
+; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 4
 define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
   %alloca = alloca i32, addrspace(5)
   %int = ptrtoint i32 addrspace(5)* %alloca to i32