Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -317,8 +317,11 @@
   MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
-  assert(TII->isMUBUF(MI));
+  assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
+         MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
+         "should only be seeing frame offset relative FrameIndex");
+
   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
   int64_t NewOffset = OffsetOp->getImm() + Offset;
@@ -978,12 +981,72 @@
     }
     default: {
-      if (TII->isMUBUF(*MI)) {
+      const DebugLoc &DL = MI->getDebugLoc();
+      bool IsMUBUF = TII->isMUBUF(*MI);
+
+      if (!IsMUBUF &&
+          MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) {
+        // Convert to an absolute stack address by finding the offset from the
+        // scratch wave base and scaling by the wave size.
+        //
+        // In an entry function/kernel the stack address is already the absolute
+        // address relative to the scratch wave offset.
+
+        unsigned DiffReg
+          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
+        unsigned ResultReg = IsCopy ?
+          MI->getOperand(0).getReg() :
+          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
+          .addReg(MFI->getFrameOffsetReg())
+          .addReg(MFI->getScratchWaveOffsetReg());
+
+        int64_t Offset = FrameInfo.getObjectOffset(Index);
+        if (Offset == 0) {
+          // XXX - This never happens because of emergency scavenging slot at 0?
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
+            .addImm(Log2_32(ST.getWavefrontSize()))
+            .addReg(DiffReg);
+        } else {
+          unsigned CarryOut
+            = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+          unsigned ScaledReg
+            = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+          // XXX - Should this use a vector shift?
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
+            .addReg(DiffReg, RegState::Kill)
+            .addImm(Log2_32(ST.getWavefrontSize()));
+
+          // TODO: Fold if use instruction is another add of a constant.
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
+            .addReg(CarryOut, RegState::Define | RegState::Dead)
+            .addImm(Offset)
+            .addReg(ScaledReg, RegState::Kill);
+
+          MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC);
+        }
+
+        // Don't introduce an extra copy if we're just materializing in a mov.
+        if (IsCopy)
+          MI->eraseFromParent();
+        else
+          FIOp.ChangeToRegister(ResultReg, false, false, true);
+        return;
+      }
+
+      if (IsMUBUF) {
         // Disable offen so we don't need a 0 vgpr base.
         assert(static_cast<int>(FIOperandNum) ==
                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::vaddr));
+        assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg()
+               == MFI->getFrameOffsetReg());
+
         int64_t Offset = FrameInfo.getObjectOffset(Index);
         int64_t OldImm
           = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
@@ -992,17 +1055,19 @@
         if (isUInt<12>(NewOffset) &&
             buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
           MI->eraseFromParent();
-          break;
+          return;
         }
       }
 
+      // If the offset is simply too big, don't convert to a scratch wave offset
+      // relative index.
+
       int64_t Offset = FrameInfo.getObjectOffset(Index);
       FIOp.ChangeToImmediate(Offset);
       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
         unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-        BuildMI(*MBB, MI, MI->getDebugLoc(),
-                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
-          .addImm(Offset);
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+          .addImm(Offset);
         FIOp.ChangeToRegister(TmpReg, false, false, true);
       }
     }
Index: test/CodeGen/AMDGPU/frame-index-elimination.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Test that non-entry function frame indices are expanded properly to
+; give an index relative to the scratch wave offset register.
+
+; Materialize into a mov. Make sure there isn't an unnecessary copy.
+; GCN-LABEL: {{^}}func_mov_fi_i32:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN: s_sub_u32 vcc_hi, s6, s4
+; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
+; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
+; GCN-NOT: v_mov
+; GCN: ds_write_b32 v0, v0
+define void @func_mov_fi_i32() #0 {
+  %alloca = alloca i32
+  store volatile i32* %alloca, i32* addrspace(3)* undef
+  ret void
+}
+
+; Materialize into an add of a constant offset from the FI.
+; FIXME: Should be able to merge adds
+
+; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN: s_sub_u32 s7, s6, s4
+; GCN-NEXT: s_lshr_b32 s7, s7, 6
+; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s7, 4
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; GCN-NOT: v_mov
+; GCN: ds_write_b32 v0, v0
+define void @func_add_constant_to_fi_i32() #0 {
+  %alloca = alloca [2 x i32], align 4
+  %gep0 = getelementptr inbounds [2 x i32], [2 x i32]* %alloca, i32 0, i32 1
+  store volatile i32* %gep0, i32* addrspace(3)* undef
+  ret void
+}
+
+; A user that the materialized frame index can't be meaningfully folded
+; into.
+
+; GCN-LABEL: {{^}}func_other_fi_user_i32:
+; GCN: s_sub_u32 vcc_hi, s6, s4
+; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
+; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
+; GCN-NEXT: v_mul_lo_i32 v0, v0, 9
+; GCN-NOT: v_mov
+; GCN: ds_write_b32 v0, v0
+define void @func_other_fi_user_i32() #0 {
+  %alloca = alloca [2 x i32], align 4
+  %ptrtoint = ptrtoint [2 x i32]* %alloca to i32
+  %mul = mul i32 %ptrtoint, 9
+  store volatile i32 %mul, i32 addrspace(3)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr:
+; GCN: v_mov_b32_e32 v1, 15{{$}}
+; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}}
+define void @func_store_private_arg_i32_ptr(i32* %ptr) #0 {
+  store volatile i32 15, i32* %ptr
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
+; GCN: s_waitcnt
+; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}}
+define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 {
+  %val = load volatile i32, i32* %ptr
+  ret void
+}
+
+attributes #0 = { nounwind }
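
For reference, the non-MUBUF expansion added in the default case rewrites the frame index as the
difference between the frame offset register and the scratch wave offset register, shifted right
by log2 of the wavefront size, plus the frame object's offset. A minimal standalone C++ sketch of
that arithmetic follows; the function name and parameters are illustrative only and are not part
of the patch or of the LLVM API.

  #include <cstdint>

  // Models ResultReg = ((FrameOffset - ScratchWaveOffset) >> log2(WavefrontSize)) + ObjectOffset,
  // i.e. the S_SUB_U32 / S_LSHR_B32 / V_ADD_I32_e64 sequence built above.
  static int64_t expandFrameIndexValue(uint32_t FrameOffset, uint32_t ScratchWaveOffset,
                                       unsigned WavefrontSize, int64_t ObjectOffset) {
    uint32_t Diff = FrameOffset - ScratchWaveOffset;         // S_SUB_U32
    uint32_t Scaled = Diff >> __builtin_ctz(WavefrontSize);  // S_LSHR_B32 by Log2_32(wave size)
    return static_cast<int64_t>(Scaled) + ObjectOffset;      // V_ADD_I32_e64
  }

With a wavefront size of 64 the shift amount is 6, which is the constant the s_lshr_b32 checks in
frame-index-elimination.ll expect, and the trailing 4 in the v_add_i32_e64 checks is the frame
object offset returned by FrameInfo.getObjectOffset(Index).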