Index: lib/Target/AMDGPU/AMDGPUCallLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -38,7 +38,8 @@
                     unsigned VReg) const override;
   bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                             ArrayRef<unsigned> VRegs) const override;
-  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+  static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+  static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
 };
 } // End of namespace llvm;
 #endif
Index: lib/Target/AMDGPU/AMDGPUCallingConv.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -13,6 +13,8 @@
 // Inversion of CCIfInReg
 class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+class CCIfExtend<CCAction A>
+  : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
 
 // Calling convention for SI
 def CC_SI : CallingConv<[
@@ -52,7 +54,7 @@
   ]>>>
 ]>;
 
-def RetCC_SI : CallingConv<[
+def RetCC_SI_Shader : CallingConv<[
   CCIfType<[i32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@@ -99,6 +101,52 @@
   CCCustom<"allocateKernArg">
 ]>;
 
+def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
+  (sequence "VGPR%u", 24, 255)
+>;
+
+def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
+  (sequence "VGPR%u", 32, 255)
+>;
+
+def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
+  (sequence "SGPR%u", 32, 103)
+>;
+
+def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
+  (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
+>;
+
+// Calling convention for leaf functions
+def CC_AMDGPU_Func : CallingConv<[
+  CCIfByVal<CCPassByVal<4, 4>>,
+  CCIfType<[i1], CCPromoteToType<i32>>,
+  CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+  CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+  CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+  CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+// Calling convention for leaf function return values
+def RetCC_AMDGPU_Func : CallingConv<[
+  CCIfType<[i1], CCPromoteToType<i32>>,
+  CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+]>;
+
 def CC_AMDGPU : CallingConv<[
   CCIf<"static_cast<const AMDGPUSubtarget&>"
        "(State.getMachineFunction().getSubtarget()).getGeneration() >="
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -115,9 +115,6 @@
                              SmallVectorImpl<SDValue> &Results) const;
   void analyzeFormalArgumentsCompute(CCState &State,
                                      const SmallVectorImpl<ISD::InputArg> &Ins) const;
-  void AnalyzeReturn(CCState &State,
-                     const SmallVectorImpl<ISD::OutputArg> &Outs) const;
-
 public:
   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
@@ -162,6 +159,8 @@
   bool isCheapToSpeculateCtlz() const override;
 
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+  static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
+
   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -76,6 +76,45 @@
   }
 }
 
+// Allocate up to VGPR31.
+//
+// TODO: Since there are no VGPR alignment requirements, would it be better to
+// split into individual scalar registers?
+static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  switch (LocVT.SimpleTy) {
+  case MVT::i64:
+  case MVT::f64:
+  case MVT::v2i32:
+  case MVT::v2f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_64RegClass, 31);
+  }
+  case MVT::v4i32:
+  case MVT::v4f32:
+  case MVT::v2i64:
+  case MVT::v2f64: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_128RegClass, 29);
+  }
+  case MVT::v8i32:
+  case MVT::v8f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_256RegClass, 25);
+
+  }
+  case MVT::v16i32:
+  case MVT::v16f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_512RegClass, 17);
+
+  }
+  default:
+    return false;
+  }
+}
+
 #include "AMDGPUGenCallingConv.inc"
 
 // Find a larger type to do a load / store of a vector with.
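
Review note: every case of allocateVGPRTuple funnels into an allocateCCRegs helper that is not part of this hunk (presumably defined earlier in AMDGPUISelLowering.cpp). For context, a minimal sketch of such a helper, assuming it only tries to hand out one register from a prefix of the given class and records the assignment; the in-tree definition may differ in detail:

static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
                           CCValAssign::LocInfo LocInfo,
                           ISD::ArgFlagsTy ArgFlags, CCState &State,
                           const TargetRegisterClass *RC,
                           unsigned NumRegs) {
  // Only consider the first NumRegs registers of the class so that a wide
  // tuple never extends past the argument register budget.
  ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
  unsigned Reg = State.AllocateReg(RegList);
  if (Reg == AMDGPU::NoRegister)
    return false; // Fall through to the CCAssignToStack rules.

  State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
  return true;
}

The per-type limits used above (31, 29, 25, 17) line up with the "Allocate up to VGPR31" comment: the last 64-bit, 128-bit, 256-bit, and 512-bit tuple that can still be allocated ends exactly at VGPR31.
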
@@ -767,8 +806,42 @@ //===---------------------------------------------------------------------===// CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, - bool IsVarArg) const { - return CC_AMDGPU; + bool IsVarArg) { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return CC_AMDGPU_Kernel; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + return CC_AMDGPU; + case CallingConv::C: + case CallingConv::Fast: + return CC_AMDGPU_Func; + default: + report_fatal_error("Unsupported calling convention."); + } +} + +CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, + bool IsVarArg) { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return CC_AMDGPU_Kernel; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + return RetCC_SI_Shader; + case CallingConv::C: + case CallingConv::Fast: + return RetCC_AMDGPU_Func; + default: + report_fatal_error("Unsupported calling convention."); + } } /// The SelectionDAGBuilder will automatically promote function arguments @@ -868,18 +941,15 @@ } } -void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, - const SmallVectorImpl &Outs) const { - - State.AnalyzeReturn(Outs, RetCC_SI); -} - -SDValue -AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SDLoc &DL, SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerReturn( + SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SDLoc &DL, SelectionDAG &DAG) const { + // FIXME: Fails for r600 tests + //assert(!isVarArg && Outs.empty() && OutVals.empty() && + // "wave terminate should not have return values"); return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain); } @@ -890,20 +960,12 @@ /// Selects the correct CCAssignFn for a given CallingConvention value. 
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) { - switch (CC) { - case CallingConv::C: - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - return CC_AMDGPU_Kernel; - case CallingConv::AMDGPU_VS: - case CallingConv::AMDGPU_HS: - case CallingConv::AMDGPU_GS: - case CallingConv::AMDGPU_PS: - case CallingConv::AMDGPU_CS: - return CC_AMDGPU; - default: - report_fatal_error("Unsupported calling convention."); - } + return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg); +} + +CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, + bool IsVarArg) { + return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); } SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -380,6 +380,6 @@ def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, +def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; Index: lib/Target/AMDGPU/AMDGPUMCInstLower.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -126,9 +126,15 @@ } void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + unsigned Opcode = MI->getOpcode(); - int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); + // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We + // need to select it to the subtarget specific version, and there's no way to + // do that with a single pseudo source operation. + if (Opcode == AMDGPU::S_SETPC_B64_return) + Opcode = AMDGPU::S_SETPC_B64; + int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode); if (MCOpcode == -1) { LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " Index: lib/Target/AMDGPU/AMDGPUMachineFunction.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -12,21 +12,6 @@ using namespace llvm; -static bool isEntryFunctionCC(CallingConv::ID CC) { - switch (CC) { - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - case CallingConv::AMDGPU_VS: - case CallingConv::AMDGPU_HS: - case CallingConv::AMDGPU_GS: - case CallingConv::AMDGPU_PS: - case CallingConv::AMDGPU_CS: - return true; - default: - return false; - } -} - AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), LocalMemoryObjects(), @@ -34,7 +19,7 @@ MaxKernArgAlign(0), LDSSize(0), ABIArgOffset(0), - IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())), + IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())), NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, // except reserved size is not correctly aligned. 
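
Review note: both selectors now share one per-calling-convention dispatch; the SelectionDAG hooks above simply forward to the static helpers on AMDGPUCallLowering. A sketch of how a caller would typically drive this, using the usual CCState workflow (the F, MF, Ins, and ArgLocs names here are illustrative, not taken from the patch):

// Pick the argument-assignment function for this function's calling
// convention, then let CCState distribute the formal arguments.
CCAssignFn *AssignFn =
    AMDGPUCallLowering::CCAssignFnForCall(F.getCallingConv(), F.isVarArg());

SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
CCInfo.AnalyzeFormalArguments(Ins, AssignFn); // Ins: SmallVector<ISD::InputArg, 16>

With the tables added in AMDGPUCallingConv.td, amdgpu_kernel/spir_kernel resolve to CC_AMDGPU_Kernel, the shader conventions keep CC_AMDGPU (and RetCC_SI_Shader for returns), and C/Fast get the new CC_AMDGPU_Func/RetCC_AMDGPU_Func, which is what lets ordinary callable functions pass arguments in VGPR0-VGPR31 and on the stack.
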
Index: lib/Target/AMDGPU/AMDGPURegisterInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -30,9 +30,6 @@ /// \returns the sub reg enum value for the given \p Channel /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) unsigned getSubRegFromChannel(unsigned Channel) const; - - const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; }; } // End namespace llvm Index: lib/Target/AMDGPU/AMDGPURegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -14,6 +14,7 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUTargetMachine.h" +#include "SIRegisterInfo.h" using namespace llvm; @@ -24,18 +25,6 @@ // they are not supported at this time. //===----------------------------------------------------------------------===// -// Dummy to not crash RegisterClassInfo. -static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; - -const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs( - const MachineFunction *) const { - return &CalleeSavedReg; -} - -unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; -} - unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { static const unsigned SubRegs[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, @@ -50,3 +39,35 @@ #define GET_REGINFO_TARGET_DESC #include "AMDGPUGenRegisterInfo.inc" + + +// Forced to be here by one .inc +const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( + const MachineFunction *MF) const { + CallingConv::ID CC = MF->getFunction()->getCallingConv(); + switch (CC) { + case CallingConv::C: + case CallingConv::Fast: + return CSR_AMDGPU_HighRegs_SaveList; + default: { + // Dummy to not crash RegisterClassInfo. + static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; + return &NoCalleeSavedReg; + } + } +} + +const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + switch (CC) { + case CallingConv::C: + case CallingConv::Fast: + return CSR_AMDGPU_HighRegs_RegMask; + default: + return nullptr; + } +} + +unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} Index: lib/Target/AMDGPU/R600RegisterInfo.h =================================================================== --- lib/Target/AMDGPU/R600RegisterInfo.h +++ lib/Target/AMDGPU/R600RegisterInfo.h @@ -27,6 +27,8 @@ R600RegisterInfo(); BitVector getReservedRegs(const MachineFunction &MF) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; /// \brief get the HW encoding for a register's channel. unsigned getHWRegChan(unsigned reg) const; Index: lib/Target/AMDGPU/R600RegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/R600RegisterInfo.cpp +++ lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -56,6 +56,18 @@ return Reserved; } +// Dummy to not crash RegisterClassInfo. 
+static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; + +const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( + const MachineFunction *) const { + return &CalleeSavedReg; +} + +unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} + unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; } Index: lib/Target/AMDGPU/SIFrameLowering.h =================================================================== --- lib/Target/AMDGPU/SIFrameLowering.h +++ lib/Target/AMDGPU/SIFrameLowering.h @@ -26,6 +26,8 @@ AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override = default; + void emitEntryFunctionPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, Index: lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIFrameLowering.cpp +++ lib/Target/AMDGPU/SIFrameLowering.cpp @@ -189,8 +189,6 @@ // ---- // 13 (+1) unsigned ReservedRegCount = 13; - if (SPReg != AMDGPU::NoRegister) - ++ReservedRegCount; if (AllSGPRs.size() < ReservedRegCount) return std::make_pair(ScratchWaveOffsetReg, SPReg); @@ -208,13 +206,6 @@ MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); MFI->setScratchWaveOffsetReg(Reg); ScratchWaveOffsetReg = Reg; - } else { - if (SPReg == AMDGPU::NoRegister) - break; - - MRI.replaceRegWith(SPReg, Reg); - MFI->setStackPtrOffsetReg(Reg); - SPReg = Reg; break; } } @@ -223,8 +214,8 @@ return std::make_pair(ScratchWaveOffsetReg, SPReg); } -void SIFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const { +void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was // specified. 
const SISubtarget &ST = MF.getSubtarget(); @@ -424,6 +415,13 @@ } } +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const SIMachineFunctionInfo *MFI = MF.getInfo(); + if (MFI->isEntryFunction()) + emitEntryFunctionPrologue(MF, MBB); +} + void SIFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -28,6 +28,10 @@ uint64_t Offset, bool Signed, const ISD::InputArg *Arg = nullptr) const; + SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, + const SDLoc &SL, SDValue Chain, + const ISD::InputArg &Arg) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, @@ -178,7 +182,12 @@ const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + bool CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context) const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -918,6 +918,55 @@ return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); } +SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, + const SDLoc &SL, SDValue Chain, + const ISD::InputArg &Arg) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + if (Arg.Flags.isByVal()) { + unsigned Size = Arg.Flags.getByValSize(); + int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false); + return DAG.getFrameIndex(FrameIdx, MVT::i32); + } + + unsigned ArgOffset = VA.getLocMemOffset(); + unsigned ArgSize = VA.getValVT().getStoreSize(); + + int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true); + + // Create load nodes to retrieve arguments from the stack. 
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); + SDValue ArgValue; + + // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) + ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; + MVT MemVT = VA.getValVT(); + + switch (VA.getLocInfo()) { + default: + break; + case CCValAssign::BCvt: + MemVT = VA.getLocVT(); + break; + case CCValAssign::SExt: + ExtType = ISD::SEXTLOAD; + break; + case CCValAssign::ZExt: + ExtType = ISD::ZEXTLOAD; + break; + case CCValAssign::AExt: + ExtType = ISD::EXTLOAD; + break; + } + + ArgValue = DAG.getExtLoad( + ExtType, SL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + MemVT); + return ArgValue; +} + static void processShaderInputArgs(SmallVectorImpl &Splits, CallingConv::ID CallConv, ArrayRef Ins, @@ -1098,10 +1147,12 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { + SIMachineFunctionInfo &Info, + bool NeedSP) { // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. - bool HasStackObjects = MF.getFrameInfo().hasStackObjects(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool HasStackObjects = MFI.hasStackObjects(); // Record that we know we have non-spill stack objects so we don't need to // check all stack objects later. @@ -1159,6 +1210,15 @@ Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } + + if (NeedSP){ + unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); + Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + + assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg()); + assert(!TRI.isSubRegister(Info.getScratchRSrcReg(), + Info.getStackPtrOffsetReg())); + } } SDValue SITargetLowering::LowerFormalArguments( @@ -1227,8 +1287,10 @@ !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); + } else if (IsKernel) { + assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); } else { - assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX())); + Splits.append(Ins.begin(), Ins.end()); } if (IsEntryFunc) { @@ -1282,11 +1344,14 @@ InVals.push_back(Arg); continue; + } else if (!IsEntryFunc && VA.isMemLoc()) { + SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); + InVals.push_back(Val); + if (!Arg.Flags.isByVal()) + Chains.push_back(Val.getValue(1)); + continue; } - if (VA.isMemLoc()) - report_fatal_error("memloc not supported with calling convention"); - assert(VA.isRegLoc() && "Parameter must be in a register!"); unsigned Reg = VA.getLocReg(); @@ -1295,7 +1360,7 @@ Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - if (Arg.VT.isVector()) { + if (IsShader && Arg.VT.isVector()) { // Build a vector from the registers Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); @@ -1321,16 +1386,49 @@ InVals.push_back(Val); } + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + + // TODO: Could maybe omit SP if only tail calls? + bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects(); + // Start adding system SGPRs. 
-  if (IsEntryFunc)
+  if (IsEntryFunc) {
     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
-
-  reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
+    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP);
+  } else {
+    CCInfo.AllocateReg(Info->getScratchRSrcReg());
+    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
+    CCInfo.AllocateReg(Info->getFrameOffsetReg());
+
+    if (NeedSP) {
+      unsigned StackPtrReg = findFirstFreeSGPR(CCInfo);
+      CCInfo.AllocateReg(StackPtrReg);
+      Info->setStackPtrOffsetReg(StackPtrReg);
+    }
+  }
 
   return Chains.empty() ? Chain :
     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
 
+// TODO: If return values can't fit in registers, we should return as many as
+// possible in registers before passing on stack.
+bool SITargetLowering::CanLowerReturn(
+  CallingConv::ID CallConv,
+  MachineFunction &MF, bool IsVarArg,
+  const SmallVectorImpl<ISD::OutputArg> &Outs,
+  LLVMContext &Context) const {
+  // Replacing returns with sret/stack usage doesn't make sense for shaders.
+  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
+  // for shaders. Vector types should be explicitly handled by CC.
+  if (AMDGPU::isEntryFunctionCC(CallConv))
+    return true;
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
+}
+
 SDValue
 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
@@ -1340,11 +1438,15 @@
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  if (!AMDGPU::isShader(CallConv))
+  if (AMDGPU::isKernel(CallConv)) {
     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                              OutVals, DL, DAG);
+  }
+
+  bool IsShader = AMDGPU::isShader(CallConv);
 
   Info->setIfReturnsVoid(Outs.size() == 0);
+  bool IsWaveEnd = Info->returnsVoid() && IsShader;
 
   SmallVector<ISD::OutputArg, 48> Splits;
   SmallVector<SDValue, 48> SplitVals;
@@ -1353,7 +1455,7 @@
   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
     const ISD::OutputArg &Out = Outs[i];
 
-    if (Out.VT.isVector()) {
+    if (IsShader && Out.VT.isVector()) {
       MVT VT = Out.VT.getVectorElementType();
       ISD::OutputArg NewOut = Out;
       NewOut.Flags.setSplit();
@@ -1384,29 +1486,58 @@
                  *DAG.getContext());
 
   // Analyze outgoing return values.
-  AnalyzeReturn(CCInfo, Splits);
+  CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
 
   SDValue Flag;
   SmallVector<SDValue, 48> RetOps;
   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
 
+  // Add return address for callable functions.
+  if (!Info->isEntryFunction()) {
+    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+    SDValue ReturnAddrReg = CreateLiveInRegister(
+      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+    // FIXME: Should be able to use a vreg here, but need a way to prevent it
+    // from being allocated to a CSR.
+
+    SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+                                                MVT::i64);
+
+    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+    Flag = Chain.getValue(1);
+
+    RetOps.push_back(PhysReturnAddrReg);
+  }
+
   // Copy the result values into the output registers.
   for (unsigned i = 0, realRVLocIdx = 0;
        i != RVLocs.size();
        ++i, ++realRVLocIdx) {
     CCValAssign &VA = RVLocs[i];
     assert(VA.isRegLoc() && "Can only return in registers!");
+    // TODO: Partially return in registers if return values don't fit.
 
     SDValue Arg = SplitVals[realRVLocIdx];
 
     // Copied from other backends.
     switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full:
       break;
     case CCValAssign::BCvt:
       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
       break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    default:
+      llvm_unreachable("Unknown loc info!");
     }
 
     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
@@ -1414,12 +1545,16 @@
     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }
 
+  // FIXME: Does sret work properly?
+
   // Update chain and glue.
   RetOps[0] = Chain;
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG;
+  unsigned Opc = AMDGPUISD::ENDPGM;
+  if (!IsWaveEnd)
+    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
 
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -388,9 +388,8 @@
   void setScratchWaveOffsetReg(unsigned Reg) {
     assert(Reg != AMDGPU::NoRegister && "Should never be unset");
     ScratchWaveOffsetReg = Reg;
-
-    // FIXME: Only for entry functions.
-    FrameOffsetReg = ScratchWaveOffsetReg;
+    if (isEntryFunction())
+      FrameOffsetReg = ScratchWaveOffsetReg;
   }
 
   unsigned getQueuePtrUserSGPR() const {
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -80,17 +80,22 @@
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
   WavesPerEU = ST.getWavesPerEU(*F);
 
-  // Non-entry functions have no special inputs for now.
-  // TODO: Return early for non-entry CCs.
+  if (!isEntryFunction()) {
+    // Non-entry functions have no special inputs for now, other than
+    // registers required for scratch access.
+ ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; + ScratchWaveOffsetReg = AMDGPU::SGPR4; + FrameOffsetReg = AMDGPU::SGPR5; + return; + } CallingConv::ID CC = F->getCallingConv(); - if (CC == CallingConv::AMDGPU_PS) - PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); - - if (AMDGPU::isKernel(CC)) { + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { KernargSegmentPtr = true; WorkGroupIDX = true; WorkItemIDX = true; + } else if (CC == CallingConv::AMDGPU_PS) { + PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); } if (ST.debuggerEmitPrologue()) { @@ -120,7 +125,7 @@ const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool MaySpill = ST.isVGPRSpillingEnabled(*F); - bool HasStackObjects = FrameInfo.hasStackObjects(); + bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls(); if (HasStackObjects || MaySpill) { PrivateSegmentWaveByteOffset = true; Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -17,6 +17,7 @@ #include "AMDGPURegisterInfo.h" #include "SIDefines.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { @@ -57,8 +58,16 @@ unsigned reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const; + unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; + + unsigned getFrameRegister(const MachineFunction &MF) const override; + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; @@ -228,6 +237,11 @@ const int *getRegUnitPressureSets(unsigned RegUnit) const override; + unsigned getReturnAddressReg(const MachineFunction &MF) const { + // Not a callee saved register. + return AMDGPU::SGPR30_SGPR31; + } + private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -117,11 +117,7 @@ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } -unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( - const MachineFunction &MF) const { - - const SISubtarget &ST = MF.getSubtarget(); - unsigned RegCount = ST.getMaxNumSGPRs(MF); +static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { unsigned Reg; // Try to place it in a hole after PrivateSegmentBufferReg. @@ -134,9 +130,22 @@ // wave offset before it. 
Reg = RegCount - 5; } + + return Reg; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const SISubtarget &ST = MF.getSubtarget(); + unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); return AMDGPU::SGPR_32RegClass.getRegister(Reg); } +unsigned SIRegisterInfo::reservedStackPtrOffsetReg( + const MachineFunction &MF) const { + return AMDGPU::SGPR32; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -198,15 +207,33 @@ assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); + if (StackPtrReg != AMDGPU::NoRegister) { + reserveRegisterTuples(Reserved, StackPtrReg); + assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); + } + + unsigned FrameReg = MFI->getFrameOffsetReg(); + if (FrameReg != AMDGPU::NoRegister) { + reserveRegisterTuples(Reserved, FrameReg); + assert(!isSubRegister(ScratchRSrcReg, FrameReg)); + } + return Reserved; } bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { - return Fn.getFrameInfo().hasStackObjects(); + const SIMachineFunctionInfo *Info = Fn.getInfo(); + if (Info->isEntryFunction()) { + const MachineFrameInfo &MFI = Fn.getFrameInfo(); + return MFI.hasStackObjects() || MFI.hasCalls(); + } + + // May need scavenger for dealing with callee saved registers. + return true; } -bool -SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { +bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { return MF.getFrameInfo().hasStackObjects(); } Index: lib/Target/AMDGPU/SOPInstructions.td =================================================================== --- lib/Target/AMDGPU/SOPInstructions.td +++ lib/Target/AMDGPU/SOPInstructions.td @@ -186,11 +186,23 @@ def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; -let isTerminator = 1, isBarrier = 1, - isBranch = 1, isIndirectBranch = 1 in { +let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { + +let isBranch = 1, isIndirectBranch = 1 in { def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; +} // End isBranch = 1, isIndirectBranch = 1 + +let isReturn = 1 in { +// Define variant marked as return rather than branch. 
+def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>; +} +} // End isTerminator = 1, isBarrier = 1 + +let isCall = 1 in { +def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64" +>; } -def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">; + def S_RFE_B64 : SOP1_1 <"s_rfe_b64">; let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -262,7 +262,6 @@ LLVM_READNONE inline bool isKernel(CallingConv::ID CC) { switch (CC) { - case CallingConv::C: case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: return true; Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -518,7 +518,18 @@ } bool isEntryFunctionCC(CallingConv::ID CC) { - return true; + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + return true; + default: + return false; + } } bool isSI(const MCSubtargetInfo &STI) { Index: test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll =================================================================== --- test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll +++ test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll @@ -6,7 +6,8 @@ ; Tests for add. ; CHECK: name: addi32 ; CHECK: {{%[0-9]+}}(s32) = G_ADD -define i32 @addi32(i32 %arg1, i32 %arg2) { +define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) { %res = add i32 %arg1, %arg2 - ret i32 %res + store i32 %res, i32 addrspace(1)* undef + ret void } Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -0,0 +1,124 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Test that non-entry function frame indices are expanded properly to +; give an index relative to the scratch wave offset register + +; Materialize into a mov. Make sure there isn't an unnecessary copy. +; GCN-LABEL: {{^}}func_mov_fi_i32: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN: s_sub_u32 vcc_hi, s5, s4 +; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6 +; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @func_mov_fi_i32() #0 { + %alloca = alloca i32 + store volatile i32* %alloca, i32* addrspace(3)* undef + ret void +} + +; Materialize into an add of a constant offset from the FI. +; FIXME: Should be able to merge adds + +; GCN-LABEL: {{^}}func_add_constant_to_fi_i32: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN: s_sub_u32 s6, s5, s4 +; GCN-NEXT: s_lshr_b32 s6, s6, 6 +; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s6, 4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @func_add_constant_to_fi_i32() #0 { + %alloca = alloca [2 x i32], align 4 + %gep0 = getelementptr inbounds [2 x i32], [2 x i32]* %alloca, i32 0, i32 1 + store volatile i32* %gep0, i32* addrspace(3)* undef + ret void +} + +; A user the materialized frame index can't be meaningfully folded +; into. 
+ +; GCN-LABEL: {{^}}func_other_fi_user_i32: +; GCN: s_sub_u32 vcc_hi, s5, s4 +; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6 +; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4 +; GCN-NEXT: v_mul_lo_i32 v0, v0, 9 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @func_other_fi_user_i32() #0 { + %alloca = alloca [2 x i32], align 4 + %ptrtoint = ptrtoint [2 x i32]* %alloca to i32 + %mul = mul i32 %ptrtoint, 9 + store volatile i32 %mul, i32 addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr: +; GCN: v_mov_b32_e32 v1, 15{{$}} +; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}} +define void @func_store_private_arg_i32_ptr(i32* %ptr) #0 { + store volatile i32 15, i32* %ptr + ret void +} + +; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: +; GCN: s_waitcnt +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}} +define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 { + %val = load volatile i32, i32* %ptr + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr: +; GCN: s_waitcnt +; GCN-NEXT: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 }* byval %arg0) #0 { + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + %load1 = load i32, i32* %gep1 + store volatile i32* %gep1, i32* addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5 +; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4 +define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #0 { + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + %load0 = load i8, i8* %gep0 + %load1 = load i32, i32* %gep1 + store volatile i8 %load0, i8 addrspace(3)* undef + store volatile i32 %load1, i32 addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: +; GCN: s_sub_u32 s8, s5, s4 +; GCN: v_lshr_b32_e64 v1, s8, 6 +; GCN: s_and_saveexec_b64 + +; GCN: v_add_i32_e32 v0, vcc, 4, v1 +; GCN: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4 +; GCN: ds_write_b32 +define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 }* byval %arg0, i32 %arg2) #0 { + %cmp = icmp eq i32 %arg2, 0 + br i1 %cmp, label %bb, label %ret + +bb: + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + %load1 = load volatile i32, i32* %gep1 + store volatile i32* %gep1, i32* addrspace(3)* undef + br label %ret + +ret: + ret void +} + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/function-args.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/function-args.ll @@ -0,0 +1,734 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck 
-enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}void_func_i1: +; GCN: v_and_b32_e32 v0, 1, v0 +; GCN: buffer_store_byte v0, off +define void @void_func_i1(i1 %arg0) #0 { + store i1 %arg0, i1 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i1_zeroext: +; GCN: s_waitcnt +; GCN-NEXT: v_or_b32_e32 v0, 12, v0 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 { + %ext = zext i1 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i1_signext: +; GCN: s_waitcnt +; GCN-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_i1_signext(i1 signext %arg0) #0 { + %ext = sext i1 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i8: +; GCN-NOT: v0 +; GCN: buffer_store_byte v0, off +define void @void_func_i8(i8 %arg0) #0 { + store i8 %arg0, i8 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i8_zeroext: +; GCN-NOT: and_b32 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 { + %ext = zext i8 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i8_signext: +; GCN-NOT: v_bfe_i32 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i8_signext(i8 signext %arg0) #0 { + %ext = sext i8 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i16: +; GCN: buffer_store_short v0, off +define void @void_func_i16(i16 %arg0) #0 { + store i16 %arg0, i16 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i16_zeroext: +; GCN-NOT: v0 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 { + %ext = zext i16 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i16_signext: +; GCN-NOT: v0 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i16_signext(i16 signext %arg0) #0 { + %ext = sext i16 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i32: +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_i32(i32 %arg0) #0 { + store i32 %arg0, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i64: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void @void_func_i64(i64 %arg0) #0 { + store i64 %arg0, i64 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_f16: +; VI-NOT: v0 +; CI: v_cvt_f16_f32_e32 v0, v0 +; GCN: buffer_store_short v0, off +define void @void_func_f16(half %arg0) #0 { + store half %arg0, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_f32 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_f32(float %arg0) #0 { + store float %arg0, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_f64: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void @void_func_f64(double %arg0) #0 { + store double %arg0, double addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2i32: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void 
@void_func_v2i32(<2 x i32> %arg0) #0 { + store <2 x i32> %arg0, <2 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3i32: +; GCN-DAG: buffer_store_dword v2, off +; GCN-DAG: buffer_store_dwordx2 v[0:1], off +define void @void_func_v3i32(<3 x i32> %arg0) #0 { + store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4i32: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v4i32(<4 x i32> %arg0) #0 { + store <4 x i32> %arg0, <4 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v5i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dword v4, off +define void @void_func_v5i32(<5 x i32> %arg0) #0 { + store <5 x i32> %arg0, <5 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v8i32(<8 x i32> %arg0) #0 { + store <8 x i32> %arg0, <8 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v16i32(<16 x i32> %arg0) #0 { + store <16 x i32> %arg0, <16 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +define void @void_func_v32i32(<32 x i32> %arg0) #0 { + store <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + ret void +} + +; 1 over register limit +; GCN-LABEL: {{^}}void_func_v33i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s5 +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +; GCN: buffer_store_dword [[STACKLOAD]], off +define void @void_func_v33i32(<33 x i32> %arg0) #0 { + store <33 x i32> %arg0, <33 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2i64: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v2i64(<2 x i64> %arg0) #0 { + store <2 x i64> %arg0, <2 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx2 v[4:5], off +define void @void_func_v3i64(<3 x i64> %arg0) #0 { + store <3 x i64> %arg0, <3 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v4i64(<4 x i64> %arg0) #0 { + store <4 x i64> %arg0, <4 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v5i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx2 v[8:9], off +define void @void_func_v5i64(<5 x i64> %arg0) #0 { + store <5 x i64> %arg0, <5 x 
i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v8i64(<8 x i64> %arg0) #0 { + store <8 x i64> %arg0, <8 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +define void @void_func_v16i64(<16 x i64> %arg0) #0 { + store <16 x i64> %arg0, <16 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2i16: +; GFX9-NOT: v0 +; GFX9: buffer_store_dword v0, off +define void @void_func_v2i16(<2 x i16> %arg0) #0 { + store <2 x i16> %arg0, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3i16: +; GCN-DAG: buffer_store_dword v0, off +; GCN-DAG: buffer_store_short v2, off +define void @void_func_v3i16(<3 x i16> %arg0) #0 { + store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4i16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9: buffer_store_dwordx2 v[0:1], off +define void @void_func_v4i16(<4 x i16> %arg0) #0 { + store <4 x i16> %arg0, <4 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v5i16: +; GCN-DAG: buffer_store_short v4, off, +; GCN-DAG: buffer_store_dwordx2 v[1:2], off +define void @void_func_v5i16(<5 x i16> %arg0) #0 { + store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8i16: +; GFX9-DAG: buffer_store_dwordx4 v[0:3], off +define void @void_func_v8i16(<8 x i16> %arg0) #0 { + store <8 x i16> %arg0, <8 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16i16: +; GFX9-DAG: buffer_store_dwordx4 v[0:3], off +; GFX9-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v16i16(<16 x i16> %arg0) #0 { + store <16 x i16> %arg0, <16 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2f32: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void @void_func_v2f32(<2 x float> %arg0) #0 { + store <2 x float> %arg0, <2 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3f32: +; GCN-DAG: buffer_store_dword v2, off +; GCN-DAG: buffer_store_dwordx2 v[0:1], off +define void @void_func_v3f32(<3 x float> %arg0) #0 { + store <3 x float> %arg0, <3 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4f32: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v4f32(<4 x float> %arg0) #0 { + store <4 x float> %arg0, <4 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8f32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v8f32(<8 x float> %arg0) #0 { + store <8 x float> %arg0, <8 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16f32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v16f32(<16 x float> %arg0) #0 { + store 
<16 x float> %arg0, <16 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2f64: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v2f64(<2 x double> %arg0) #0 { + store <2 x double> %arg0, <2 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx2 v[4:5], off +define void @void_func_v3f64(<3 x double> %arg0) #0 { + store <3 x double> %arg0, <3 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v4f64(<4 x double> %arg0) #0 { + store <4 x double> %arg0, <4 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v8f64(<8 x double> %arg0) #0 { + store <8 x double> %arg0, <8 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +define void @void_func_v16f64(<16 x double> %arg0) #0 { + store <16 x double> %arg0, <16 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2f16: +; GFX9-NOT: v0 +; GFX9: buffer_store_dword v0, off +define void @void_func_v2f16(<2 x half> %arg0) #0 { + store <2 x half> %arg0, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3f16: +; GFX9-NOT: v0 +; GCN-DAG: buffer_store_dword v0, off +; GCN-DAG: buffer_store_short v2, off +define void @void_func_v3f16(<3 x half> %arg0) #0 { + store <3 x half> %arg0, <3 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4f16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9-NOT: v[0:1] +; GFX9: buffer_store_dwordx2 v[0:1], off +define void @void_func_v4f16(<4 x half> %arg0) #0 { + store <4 x half> %arg0, <4 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8f16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9: buffer_store_dwordx4 v[0:3], off +define void @void_func_v8f16(<8 x half> %arg0) #0 { + store <8 x half> %arg0, <8 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16f16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9-DAG: buffer_store_dwordx4 v[0:3], off +; GFX9-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v16f16(<16 x half> %arg0) #0 { + store <16 x half> %arg0, <16 x half> addrspace(1)* undef + ret void +} + +; Make sure there is no alignment requirement for passed vgprs. 
+; GCN-LABEL: {{^}}void_func_i32_i64_i32: +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +; GCN: buffer_store_dwordx2 v[1:2] +; GCN: buffer_store_dword v3 +define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 { + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i64 %arg1, i64 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_struct_i32: +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_struct_i32({ i32 } %arg0) #0 { + store { i32 } %arg0, { i32 } addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_struct_i8_i32: +; GCN-DAG: buffer_store_byte v0, off +; GCN-DAG: buffer_store_dword v1, off +define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 { + store { i8, i32 } %arg0, { i8, i32 } addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32: +; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_store_dword v[[ELT1]] +; GCN-DAG: buffer_store_byte v[[ELT0]] +define void @void_func_byval_struct_i8_i32({ i8, i32 }* byval %arg0) #0 { + %arg0.load = load { i8, i32 }, { i8, i32 }* %arg0 + store { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2: +; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN: ds_write_b32 v0, v0 +; GCN: s_setpc_b64 +define void @void_func_byval_struct_i8_i32_x2({ i8, i32 }* byval %arg0, { i8, i32 }* byval %arg1, i32 %arg2) #0 { + %arg0.load = load volatile { i8, i32 }, { i8, i32 }* %arg0 + %arg1.load = load volatile { i8, i32 }, { i8, i32 }* %arg1 + store volatile { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef + store volatile { i8, i32 } %arg1.load, { i8, i32 } addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64: +; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off +define void @void_func_byval_i32_byval_i64(i32* byval %arg0, i64* byval %arg1) #0 { + %arg0.load = load i32, i32* %arg0 + %arg1.load = load i64, i64* %arg1 + store i32 %arg0.load, i32 addrspace(1)* undef + store i64 %arg1.load, i64 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_i32_i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:4 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:8 
+ +; GCN: buffer_store_dword v[[LOAD_ARG1]] +; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off +define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i64 %arg2, i64 addrspace(1)* undef + ret void +} + +; FIXME: Different ext load types on CI vs. VI +; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16: +; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]] +; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]] + +; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off +; GCN: buffer_store_byte [[LOAD_ARG2]], off +; GCN: buffer_store_short [[LOAD_ARG3]], off +; VI: buffer_store_short [[LOAD_ARG4]], off + +; CI: buffer_store_short [[CVT_ARG4]], off +define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile i1 %arg1, i1 addrspace(1)* undef + store volatile i8 %arg2, i8 addrspace(1)* undef + store volatile i16 %arg3, i16 addrspace(1)* undef + store volatile half %arg4, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off +; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off +define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef + store volatile <2 x float> %arg2, <2 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16: +; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}} +; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GFX9: buffer_store_dword [[LOAD_ARG1]], off +; GFX9: buffer_store_short [[LOAD_ARG2]], off +define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <2 x i16> %arg1, <2 x i16> addrspace(1)* undef + store volatile <2 x half> %arg2, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword 
v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}} + +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef + store volatile <2 x double> %arg2, <2 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}} + +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef + store volatile <4 x float> %arg2, <4 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], 
off, s[0:3], s5 offset:60{{$}} + +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef + store volatile <8 x float> %arg2, <8 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s5 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s5 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s5 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s5 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s5 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s5 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s5 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_15:[0-9]+]], off, s[0:3], s5 offset:60{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:64{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:68{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:72{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:76{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:80{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:84{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:88{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:92{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s5 offset:96{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s5 offset:100{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s5 offset:104{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s5 offset:108{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s5 offset:112{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s5 offset:116{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s5 offset:120{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:124{{$}} +define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> 
%arg1, <16 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef + store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef + ret void +} + +; Check there is no crash. +; GCN-LABEL: {{^}}void_func_v16i8: +define void @void_func_v16i8(<16 x i8> %arg0) #0 { + store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef + ret void +} + +; Check there is no crash. +; GCN-LABEL: {{^}}void_func_v32i32_v16i8: +define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/function-returns.ll @@ -0,0 +1,514 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}i1_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define i1 @i1_func_void() #0 { + %val = load i1, i1 addrspace(1)* undef + ret i1 %val +} + +; FIXME: Missing and? +; GCN-LABEL: {{^}}i1_zeroext_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define zeroext i1 @i1_zeroext_func_void() #0 { + %val = load i1, i1 addrspace(1)* undef + ret i1 %val +} + +; GCN-LABEL: {{^}}i1_signext_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}} +; GCN-NEXT: s_setpc_b64 +define signext i1 @i1_signext_func_void() #0 { + %val = load i1, i1 addrspace(1)* undef + ret i1 %val +} + +; GCN-LABEL: {{^}}i8_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i8 @i8_func_void() #0 { + %val = load i8, i8 addrspace(1)* undef + ret i8 %val +} + +; GCN-LABEL: {{^}}i8_zeroext_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define zeroext i8 @i8_zeroext_func_void() #0 { + %val = load i8, i8 addrspace(1)* undef + ret i8 %val +} + +; GCN-LABEL: {{^}}i8_signext_func_void: +; GCN: buffer_load_sbyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define signext i8 @i8_signext_func_void() #0 { + %val = load i8, i8 addrspace(1)* undef + ret i8 %val +} + +; GCN-LABEL: {{^}}i16_func_void: +; GCN: buffer_load_ushort v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i16 @i16_func_void() #0 { + %val = load i16, i16 addrspace(1)* undef + ret i16 %val +} + +; GCN-LABEL: {{^}}i16_zeroext_func_void: +; GCN: buffer_load_ushort v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define zeroext i16 @i16_zeroext_func_void() #0 { + %val = load i16, i16 addrspace(1)* undef + ret i16 %val +} + +; GCN-LABEL: {{^}}i16_signext_func_void: +; GCN: buffer_load_sshort v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define signext i16 @i16_signext_func_void() #0 { + %val = load i16, i16 addrspace(1)* undef 
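+ ; Note: buffer_load_sshort already sign-extends into the full 32-bit VGPR, so
+ ; unlike the i1 signext case above, no extra v_bfe_i32 is expected before the
+ ; return here.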
+ ret i16 %val +} + +; GCN-LABEL: {{^}}i32_func_void: +; GCN: buffer_load_dword v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i32 @i32_func_void() #0 { + %val = load i32, i32 addrspace(1)* undef + ret i32 %val +} + +; GCN-LABEL: {{^}}i64_func_void: +; GCN: buffer_load_dwordx2 v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i64 @i64_func_void() #0 { + %val = load i64, i64 addrspace(1)* undef + ret i64 %val +} + +; GCN-LABEL: {{^}}f32_func_void: +; GCN: buffer_load_dword v0, off, s[8:11], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define float @f32_func_void() #0 { + %val = load float, float addrspace(1)* undef + ret float %val +} + +; GCN-LABEL: {{^}}f64_func_void: +; GCN: buffer_load_dwordx2 v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define double @f64_func_void() #0 { + %val = load double, double addrspace(1)* undef + ret double %val +} + +; GCN-LABEL: {{^}}v2i32_func_void: +; GCN: buffer_load_dwordx2 v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <2 x i32> @v2i32_func_void() #0 { + %val = load <2 x i32>, <2 x i32> addrspace(1)* undef + ret <2 x i32> %val +} + +; GCN-LABEL: {{^}}v3i32_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <3 x i32> @v3i32_func_void() #0 { + %val = load <3 x i32>, <3 x i32> addrspace(1)* undef + ret <3 x i32> %val +} + +; GCN-LABEL: {{^}}v4i32_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <4 x i32> @v4i32_func_void() #0 { + %val = load <4 x i32>, <4 x i32> addrspace(1)* undef + ret <4 x i32> %val +} + +; GCN-LABEL: {{^}}v5i32_func_void: +; GCN-DAG: buffer_load_dword v4, off +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <5 x i32> @v5i32_func_void() #0 { + %val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef + ret <5 x i32> %val +} + +; GCN-LABEL: {{^}}v8i32_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <8 x i32> @v8i32_func_void() #0 { + %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef + %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr + ret <8 x i32> %val +} + +; GCN-LABEL: {{^}}v16i32_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <16 x i32> @v16i32_func_void() #0 { + %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef + %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr + ret <16 x i32> %val +} + +; GCN-LABEL: {{^}}v32i32_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dwordx4 v[16:19], off +; GCN-DAG: buffer_load_dwordx4 v[20:23], off +; GCN-DAG: buffer_load_dwordx4 v[24:27], off +; GCN-DAG: buffer_load_dwordx4 v[28:31], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <32 x i32> @v32i32_func_void() #0 { + %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef + %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr + ret 
<32 x i32> %val +} + +; GCN-LABEL: {{^}}v2i64_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <2 x i64> @v2i64_func_void() #0 { + %val = load <2 x i64>, <2 x i64> addrspace(1)* undef + ret <2 x i64> %val +} + +; GCN-LABEL: {{^}}v3i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <3 x i64> @v3i64_func_void() #0 { + %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(2)* undef + %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr + ret <3 x i64> %val +} + +; GCN-LABEL: {{^}}v4i64_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN: buffer_load_dwordx4 v[4:7], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <4 x i64> @v4i64_func_void() #0 { + %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(2)* undef + %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr + ret <4 x i64> %val +} + +; GCN-LABEL: {{^}}v5i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <5 x i64> @v5i64_func_void() #0 { + %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(2)* undef + %val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr + ret <5 x i64> %val +} + +; GCN-LABEL: {{^}}v8i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <8 x i64> @v8i64_func_void() #0 { + %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(2)* undef + %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr + ret <8 x i64> %val +} + +; GCN-LABEL: {{^}}v16i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dwordx4 v[16:19], off +; GCN-DAG: buffer_load_dwordx4 v[20:23], off +; GCN-DAG: buffer_load_dwordx4 v[24:27], off +; GCN-DAG: buffer_load_dwordx4 v[28:31], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <16 x i64> @v16i64_func_void() #0 { + %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(2)* undef + %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr + ret <16 x i64> %val +} + +; GCN-LABEL: {{^}}v2i16_func_void: +; GFX9: buffer_load_dword v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <2 x i16> @v2i16_func_void() #0 { + %val = load <2 x i16>, <2 x i16> addrspace(1)* undef + ret <2 x i16> %val +} + +; GCN-LABEL: {{^}}v3i16_func_void: +; GFX9: buffer_load_dwordx2 v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <3 x i16> @v3i16_func_void() #0 { + %val = load <3 x i16>, <3 x i16> addrspace(1)* undef + ret <3 x i16> %val +} + +; GCN-LABEL: {{^}}v4i16_func_void: +; GFX9: buffer_load_dwordx2 v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <4 x i16> @v4i16_func_void() #0 { + %val = load <4 x i16>, <4 x i16> addrspace(1)* undef + ret <4 x i16> %val +} + +; FIXME: Should not scalarize +; GCN-LABEL: {{^}}v5i16_func_void: +; GFX9: buffer_load_dwordx2 v[0:1] +; GFX9: buffer_load_ushort v4 +; 
GFX9: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9: v_mov_b32_e32 v2, v1 +; GFX9: v_lshrrev_b32_e32 v3, 16, v0 +; GCN: s_setpc_b64 +define <5 x i16> @v5i16_func_void() #0 { + %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(2)* undef + %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr + ret <5 x i16> %val +} + +; GCN-LABEL: {{^}}v8i16_func_void: +; GFX9-DAG: buffer_load_dwordx4 v[0:3], off +; GFX9: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <8 x i16> @v8i16_func_void() #0 { + %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(2)* undef + %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + ret <8 x i16> %val +} + +; GCN-LABEL: {{^}}v16i16_func_void: +; GFX9: buffer_load_dwordx4 v[0:3], off +; GFX9: buffer_load_dwordx4 v[4:7], off +; GFX9: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <16 x i16> @v16i16_func_void() #0 { + %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(2)* undef + %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr + ret <16 x i16> %val +} + +; FIXME: Should pack +; GCN-LABEL: {{^}}v16i8_func_void: +; GCN-DAG: v12 +; GCN-DAG: v13 +; GCN-DAG: v14 +; GCN-DAG: v15 +define <16 x i8> @v16i8_func_void() #0 { + %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef + %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + ret <16 x i8> %val +} + +; FIXME: Should pack +; GCN-LABEL: {{^}}v4i8_func_void: +; GCN: buffer_load_dword v0 +; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0 +; CI-DAG: v_bfe_u32 v1, v0, 8, 8 +; VI-DAG: v_lshrrev_b16_e32 v1, 8, v0 +; GCN: s_setpc_b64 +define <4 x i8> @v4i8_func_void() #0 { + %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(2)* undef + %val = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + ret <4 x i8> %val +} + +; GCN-LABEL: {{^}}struct_i8_i32_func_void: +; GCN-DAG: buffer_load_dword v1 +; GCN-DAG: buffer_load_ubyte v0 +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define {i8, i32} @struct_i8_i32_func_void() #0 { + %val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef + ret { i8, i32 } %val +} + +; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32: +; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]] +; GCN: buffer_load_dword [[VAL1:v[0-9]+]] +; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s4 offen{{$}} +; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s4 offen offset:4{{$}} +define void @void_func_sret_struct_i8_i32({ i8, i32 }* sret %arg0) #0 { + %val0 = load volatile i8, i8 addrspace(1)* undef + %val1 = load volatile i32, i32 addrspace(1)* undef + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + store i8 %val0, i8* %gep0 + store i32 %val1, i32* %gep1 + ret void +} + +; GCN-LABEL: {{^}}v33i32_func_void: +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}} +; GCN-DAG: 
buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <33 x i32> @v33i32_func_void() #0 { + %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(2)* undef + %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr + ret <33 x i32> %val +} + +; GCN-LABEL: {{^}}struct_v32i32_i32_func_void: +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, 
s[0:3], s4 offen offset:60{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { + %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(2)* undef + %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr + ret { <32 x i32>, i32 }%val +} + +; GCN-LABEL: {{^}}struct_i32_v32i32_func_void: +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:132{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:136{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:140{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:144{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:148{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:152{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:156{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:160{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:164{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:168{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:172{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:176{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:180{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:184{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:188{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:192{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:196{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:200{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:204{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen 
offset:208{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:212{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:216{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:220{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:224{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:228{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:232{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:236{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:240{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:244{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:248{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:252{{$}} +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { + %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(2)* undef + %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr + ret { i32, <32 x i32> }%val +} + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/hsa-func.ll =================================================================== --- test/CodeGen/AMDGPU/hsa-func.ll +++ test/CodeGen/AMDGPU/hsa-func.ll @@ -27,7 +27,7 @@ ; ELF: Symbol { ; ELF: Name: simple -; ELF: Size: 44 +; ELF: Size: 48 ; ELF: Type: Function (0x2) ; ELF: } @@ -41,14 +41,12 @@ ; HSA: .p2align 2 ; HSA: {{^}}simple: ; HSA-NOT: amd_kernel_code_t - -; FIXME: Check this isn't a kernarg load when calling convention implemented. -; XHSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 +; HSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: -; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 +; HSA-CI: s_mov_b32 s[[HI:[0-9]+]], 0x100f000 ; On VI+ we also need to set MTYPE = 2 -; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000 +; HSA-VI: s_mov_b32 s[[HI:[0-9]+]], 0x1100f000 ; Make sure we generate flat store for HSA ; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} @@ -56,8 +54,9 @@ ; HSA: .size simple, .Lfunc_end0-simple ; HSA: ; Function info: ; HSA-NOT: COMPUTE_PGM_RSRC2 -define void @simple(i32 addrspace(1)* %out) { +define void @simple(i32 addrspace(1)* addrspace(2)* %ptr.out) { entry: + %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out store i32 0, i32 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/inline-asm.ll =================================================================== --- test/CodeGen/AMDGPU/inline-asm.ll +++ test/CodeGen/AMDGPU/inline-asm.ll @@ -191,7 +191,7 @@ ; CHECK: v_mov_b32_e32 v0, s0 ; CHECK: v_mov_b32_e32 v1, s1 ; CHECK: use v[0:1] -define void @i64_imm_input_phys_vgpr() { +define amdgpu_kernel void @i64_imm_input_phys_vgpr() { entry: call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456) ret void Index: test/CodeGen/AMDGPU/subreg_interference.mir =================================================================== --- test/CodeGen/AMDGPU/subreg_interference.mir +++ test/CodeGen/AMDGPU/subreg_interference.mir @@ -1,4 +1,12 @@ # RUN: llc -o - %s -mtriple=amdgcn--amdhsa -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s +--- | + + define amdgpu_kernel void @func0() { + ret void + } + +... 
+ --- # We should not detect any interference between v0/v1 here and only allocate # sgpr0-sgpr3.