Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -587,6 +587,8 @@
   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
   Info.PrivateSegmentSize = FrameInfo.getStackSize();
+  if (MFI->isStackRealigned())
+    Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
 
   Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -13,6 +13,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -487,9 +488,43 @@
   }
 }
 
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
+  MachineFunction *MF = MBB.getParent();
+
+  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+  LivePhysRegs LiveRegs(TRI);
+  LiveRegs.addLiveIns(MBB);
+
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+    if (LiveRegs.available(MRI, Reg))
+      return Reg;
+  }
+
+  return AMDGPU::NoRegister;
+}
+
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
-  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
   if (FuncInfo->isEntryFunction()) {
     emitEntryFunctionPrologue(MF, MBB);
     return;
@@ -498,6 +533,7 @@
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
 
   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
   unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -505,8 +541,36 @@
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;
 
+  // XXX - Is this the right predicate?
+
   bool NeedFP = hasFP(MF);
-  if (NeedFP) {
+  uint32_t NumBytes = MFI.getStackSize();
+  uint32_t RoundedSize = NumBytes;
+  const bool NeedsRealignment = TRI.needsStackRealignment(MF);
+
+  if (NeedsRealignment) {
+    assert(NeedFP);
+    const unsigned Alignment = MFI.getMaxAlignment();
+    const unsigned ZeroLowBits = countTrailingZeros(Alignment);
+    assert(ZeroLowBits > 1);
+
+    RoundedSize += Alignment;
+
+    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+    assert(ScratchSPReg != AMDGPU::NoRegister);
+
+    // s_add_u32 tmp_reg, s32, NumBytes
+    // s_and_b32 s32, tmp_reg, 0b111...0000
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
+      .addReg(StackPtrReg)
+      .addImm((Alignment - 1) * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
+      .addReg(ScratchSPReg, RegState::Kill)
+      .addImm(-Alignment * ST.getWavefrontSize())
+      .setMIFlag(MachineInstr::FrameSetup);
+    FuncInfo->setIsStackRealigned(true);
+  } else if (NeedFP) {
     // If we need a base pointer, set it up here. It's whatever the value of
     // the stack pointer is at this point. Any variable size objects will be
     // allocated after this, so we can still use the base pointer to reference
@@ -516,11 +580,10 @@
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
-  uint32_t NumBytes = MFI.getStackSize();
-  if (NumBytes != 0 && hasSP(MF)) {
+  if (RoundedSize != 0 && hasSP(MF)) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
       .addReg(StackPtrReg)
-      .addImm(NumBytes * ST.getWavefrontSize())
+      .addImm(RoundedSize * ST.getWavefrontSize())
       .setMIFlag(MachineInstr::FrameSetup);
   }
@@ -566,10 +629,12 @@
   // it's really whether we need SP to be accurate or not.
   if (NumBytes != 0 && hasSP(MF)) {
+    uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+      NumBytes + MFI.getMaxAlignment() : NumBytes;
+
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
       .addReg(StackPtrReg)
-      .addImm(NumBytes * ST.getWavefrontSize())
-      .setMIFlag(MachineInstr::FrameDestroy);
+      .addImm(RoundedSize * ST.getWavefrontSize());
   }
 }
@@ -759,7 +824,8 @@
 }
 
 bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+  const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
   // All stack operations are relative to the frame offset SGPR.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  return MFI.hasCalls() || MFI.hasVarSizedObjects();
+  return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF);
 }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -142,6 +142,7 @@
   bool HasSpilledSGPRs = false;
   bool HasSpilledVGPRs = false;
   bool HasNonSpillStackObjects = false;
+  bool IsStackRealigned = false;
 
   unsigned NumSpilledSGPRs = 0;
   unsigned NumSpilledVGPRs = 0;
@@ -495,6 +496,14 @@
     HasNonSpillStackObjects = StackObject;
   }
 
+  bool isStackRealigned() const {
+    return IsStackRealigned;
+  }
+
+  void setIsStackRealigned(bool Realigned = true) {
+    IsStackRealigned = Realigned;
+  }
+
   unsigned getNumSpilledSGPRs() const {
     return NumSpilledSGPRs;
   }
Index: test/CodeGen/AMDGPU/stack-realign.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/stack-realign.ll
@@ -0,0 +1,125 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that we properly realign the stack. While 4-byte access is all
+; that is ever needed, some transformations rely on the known bits from the alignment of the pointer (e.g.
+
+; 128 byte object
+; 4 byte emergency stack slot
+; = 144 bytes with padding between them
+
+; GCN-LABEL: {{^}}needs_align16_default_stack_align:
+; GCN: s_mov_b32 s5, s32
+; GCN-NOT: s32
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN-NOT: s32
+
+; GCN: ; ScratchSize: 144
+define void @needs_align16_default_stack_align(i32 %idx) #0 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}needs_align16_stack_align4:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
+; GCN: s_and_b32 s5, s6, 0xfffffc00
+; GCN: s_add_u32 s32, s32, 0x2800{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN: s_sub_u32 s32, s32, 0x2800
+
+; GCN: ; ScratchSize: 160
+define void @needs_align16_stack_align4(i32 %idx) #2 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}needs_align32:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
+; GCN: s_and_b32 s5, s6, 0xfffff800
+; GCN: s_add_u32 s32, s32, 0x3000{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+
+; GCN: s_sub_u32 s32, s32, 0x3000
+
+; GCN: ; ScratchSize: 192
+define void @needs_align32(i32 %idx) #0 {
+  %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x <4 x i32>], [8 x <4 x i32>] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %gep0, align 32
+  ret void
+}
+
+; GCN-LABEL: {{^}}force_realign4:
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
+; GCN: s_and_b32 s5, s6, 0xffffff00
+; GCN: s_add_u32 s32, s32, 0xd00{{$}}
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen
+; GCN: s_sub_u32 s32, s32, 0xd00
+
+; GCN: ; ScratchSize: 52
+define void @force_realign4(i32 %idx) #1 {
+  %alloca.align16 = alloca [8 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %alloca.align16, i32 0, i32 %idx
+  store volatile i32 3, i32 addrspace(5)* %gep0, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_align16_from_8:
+; GCN: s_add_u32 s32, s8, 0x400{{$}}
+; GCN-NOT: s32
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
+  %alloca = alloca i32, align 4, addrspace(5)
+  store volatile i32 2, i32 addrspace(5)* %alloca
+  call void @needs_align16_default_stack_align(i32 1)
+  ret void
+}
+
+; The call sequence should keep the stack on call aligned to 4
+; GCN-LABEL: {{^}}kernel_call_align16_from_5:
+; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align16_from_5() {
+  %alloca0 = alloca i8, align 1, addrspace(5)
+  store volatile i8 2, i8 addrspace(5)* %alloca0
+
+  call void @needs_align16_default_stack_align(i32 1)
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_align4_from_5:
+; GCN: s_add_u32 s32, s8, 0x400
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kernel_call_align4_from_5() {
+  %alloca0 = alloca i8, align 1, addrspace(5)
+  store volatile i8 2, i8 addrspace(5)* %alloca0
+
+  call void @needs_align16_stack_align4(i32 1)
+  ret void
+}
+
+attributes #0 = { noinline nounwind }
+attributes #1 = { noinline nounwind "stackrealign" }
+attributes #2 = { noinline nounwind alignstack=4 }
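
Note (reviewer aid, not part of the patch): the s_add_u32/s_and_b32 pair emitted under NeedsRealignment is the usual power-of-two align-up, scaled by the wavefront size because the SP/FP SGPRs track scratch for the whole wave (per-thread bytes times ST.getWavefrontSize()). A minimal C++ sketch of the same arithmetic; the helper name and parameters below are illustrative only and do not exist in the tree:

  #include <cassert>
  #include <cstdint>

  // Mirrors the two FrameSetup instructions emitted in the prologue:
  //   s_add_u32 tmp, s32, (Alignment - 1) * WavefrontSize
  //   s_and_b32 fp,  tmp, -(Alignment * WavefrontSize)
  // SP is assumed to already be a multiple of WavefrontSize.
  static uint32_t alignWaveScaledSP(uint32_t SP, uint32_t Alignment,
                                    uint32_t WavefrontSize) {
    assert((Alignment & (Alignment - 1)) == 0 && "power-of-two alignment");
    uint32_t Tmp = SP + (Alignment - 1) * WavefrontSize; // s_add_u32
    return Tmp & ~(Alignment * WavefrontSize - 1);       // s_and_b32; mask equals -(Alignment * WavefrontSize)
  }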