Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -563,6 +563,8 @@
     OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
     OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+    OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
+    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
     OutStreamer->EmitIntValue(MFI->PSInputAddr, 4);
   }
 }
Index: lib/Target/AMDGPU/AMDGPUCallingConv.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -20,28 +20,83 @@
   CCIfInReg<CCIfType<[f32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
-    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+    SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
   ]>>>,
 
   CCIfInReg<CCIfType<[i32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
-    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+    SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
   ]>>>,
 
+  // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
   CCIfNotInReg<CCIfType<[f32] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
-    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+    VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+    VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+    VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+    VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+    VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+    VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+    VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+    VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+    VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+    VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+    VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+    VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+    VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
   ]>>>,
 
   CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
     [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
     [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
   >>>
 
 ]>;
 
+def RetCC_SI : CallingConv<[
+  CCIfType<[i32] , CCAssignToReg<[
+    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+    SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+  ]>>,
+
+  // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
+  CCIfType<[f32] , CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+    VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+    VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+    VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+    VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+    VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+    VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+    VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+    VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+    VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+    VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+    VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+    VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+    VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
+  ]>>
+]>;
+
 // Calling convention for R600
 def CC_R600 : CallingConv<[
   CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ ... @@
                                SmallVectorImpl<ISD::InputArg> &OrigIns) const;
   void AnalyzeFormalArguments(CCState &State,
                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
+  void AnalyzeReturn(CCState &State,
+                     const SmallVectorImpl<ISD::OutputArg> &Outs) const;
 
 public:
   AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -565,6 +565,12 @@
   State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
 }
 
+void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs) const {
+
+  State.AnalyzeReturn(Outs, RetCC_SI);
+}
+
 SDValue AMDGPUTargetLowering::LowerReturn(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -240,4 +240,4 @@
 // Call/Return DAG Nodes
 //===----------------------------------------------------------------------===//
 def IL_retflag       : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
-                              [SDNPHasChain, SDNPOptInGlue]>;
+                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -137,7 +137,7 @@
 #define   C_00B84C_EXCP_EN
 
 #define R_0286CC_SPI_PS_INPUT_ENA                                     0x0286CC
-
+#define R_0286D0_SPI_PS_INPUT_ADDR                                    0x0286D0
 #define R_00B848_COMPUTE_PGM_RSRC1                                    0x00B848
 #define   S_00B848_VGPRS(x)                                           (((x) & 0x3F) << 0)
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -95,6 +95,13 @@
                                SDLoc DL, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const override;
 
+  SDValue LowerReturn(SDValue Chain,
+                      CallingConv::ID CallConv,
+                      bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals,
+                      SDLoc DL, SelectionDAG &DAG) const override;
+
   MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
                                                   MachineBasicBlock * BB) const override;
   bool enableAggressiveFMAFusion(EVT VT) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -598,18 +598,18 @@
 
     // First check if it's a PS input addr
     if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
-        !Arg.Flags.isByVal()) {
+        !Arg.Flags.isByVal() && PSInputNum <= 15) {
 
-      assert((PSInputNum <= 15) && "Too many PS inputs!");
-
-      if (!Arg.Used) {
+      if (!Arg.Used && !(Info->PSInputAddr & (1 << PSInputNum))) {
         // We can safely skip PS inputs
         Skipped.set(i);
         ++PSInputNum;
         continue;
       }
 
-      Info->PSInputAddr |= 1 << PSInputNum++;
+      Info->PSInputEna |= 1 << PSInputNum;
+      Info->PSInputAddr |= 1 << PSInputNum;
+      ++PSInputNum;
     }
 
     // Second split vertices into their elements
@@ -640,10 +640,14 @@
 
   // At least one interpolation mode must be enabled or else the GPU will hang.
   if (Info->getShaderType() == ShaderType::PIXEL &&
-      (Info->PSInputAddr & 0x7F) == 0) {
-    Info->PSInputAddr |= 1;
-    CCInfo.AllocateReg(AMDGPU::VGPR0);
-    CCInfo.AllocateReg(AMDGPU::VGPR1);
+      (Info->PSInputEna & 0x7F) == 0) {
+    Info->PSInputEna |= 1;
+
+    if (!(Info->PSInputAddr & 0x1)) {
+      Info->PSInputAddr |= 1;
+      CCInfo.AllocateReg(AMDGPU::VGPR0);
+      CCInfo.AllocateReg(AMDGPU::VGPR1);
+    }
   }
 
   if (Info->getShaderType() == ShaderType::COMPUTE) {
@@ -872,6 +876,92 @@
 
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
 
+SDValue SITargetLowering::LowerReturn(SDValue Chain,
+                                      CallingConv::ID CallConv,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                      const SmallVectorImpl<SDValue> &OutVals,
+                                      SDLoc DL, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  Info->ReturnsVoid = Outs.size() == 0;
+
+  SmallVector<ISD::OutputArg, 48> Splits;
+  SmallVector<SDValue, 48> SplitVals;
+
+  // Split vectors into their elements.
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    const ISD::OutputArg &Out = Outs[i];
+
+    if (Out.VT.isVector()) {
+      MVT VT = Out.VT.getVectorElementType();
+      ISD::OutputArg NewOut = Out;
+      NewOut.Flags.setSplit();
+      NewOut.VT = VT;
+
+      // We want the original number of vector elements here, e.g.
+      // three or five, not four or eight.
+      unsigned NumElements = Out.ArgVT.getVectorNumElements();
+
+      for (unsigned j = 0; j != NumElements; ++j) {
+        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
+                                   DAG.getConstant(j, DL, MVT::i32));
+        SplitVals.push_back(Elem);
+        Splits.push_back(NewOut);
+        NewOut.PartOffset += NewOut.VT.getStoreSize();
+      }
+    } else {
+      SplitVals.push_back(OutVals[i]);
+      Splits.push_back(Out);
+    }
+  }
+
+  // CCValAssign - represent the assignment of the return value to a location.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slots.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Analyze outgoing return values.
+  AnalyzeReturn(CCInfo, Splits);
+
+  SDValue Flag;
+  SmallVector<SDValue, 48> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0, realRVLocIdx = 0;
+       i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = SplitVals[realRVLocIdx];
+
+    // Copied from other backends.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // Update chain and glue.
+  RetOps[0] = Chain;
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const {
@@ -2292,7 +2382,9 @@
 
   SmallVector<SDValue, 8> Ops;
   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
-    if (!isFrameIndexOp(Node->getOperand(i))) {
+    if (!isFrameIndexOp(Node->getOperand(i)) &&
+        (Node->getOpcode() != ISD::MERGE_VALUES ||
+         !isa<ConstantSDNode>(Node->getOperand(i)))) {
       Ops.push_back(Node->getOperand(i));
       continue;
     }
Index: lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaits.cpp
+++ lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -84,6 +84,9 @@
 
   bool LastInstWritesM0;
 
+  /// \brief Whether the machine function returns void
+  bool ReturnsVoid;
+
   /// \brief Get increment/decrement amount for this instruction.
   Counters getHwCounts(MachineInstr &MI);
 
@@ -322,7 +325,9 @@
                              const Counters &Required) {
 
   // End of program? No need to wait on anything
-  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
+  // A function not returning void needs to wait, because other bytecode will
+  // be appended after it and we don't know what it will be.
+  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
     return false;
 
   // Figure out if the async instructions execute in order
@@ -465,6 +470,7 @@
   LastIssued = ZeroCounts;
   LastOpcodeType = OTHER;
   LastInstWritesM0 = false;
+  ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->ReturnsVoid;
 
   memset(&UsedRegs, 0, sizeof(UsedRegs));
   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
@@ -488,6 +494,15 @@
 
     // Wait for everything at the end of the MBB
     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+
+    // Functions returning something shouldn't contain S_ENDPGM, because other
+    // bytecode will be appended after it.
+    if (!ReturnsVoid) {
+      MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+      assert(I != MBB.end());
+      if (I->getOpcode() == AMDGPU::S_ENDPGM)
+        I->eraseFromParent();
+    }
   }
 
   return Changes;
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -60,11 +60,13 @@
 public:
   // FIXME: Make private
   unsigned LDSWaveSpillSize;
+  unsigned PSInputEna;
   unsigned PSInputAddr;
   std::map<unsigned, unsigned> LaneVGPRs;
   unsigned ScratchOffsetReg;
   unsigned NumUserSGPRs;
   unsigned NumSystemSGPRs;
+  bool ReturnsVoid;
 
 private:
   bool HasSpilledSGPRs;
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -47,9 +47,11 @@
     WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
     PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     LDSWaveSpillSize(0),
+    PSInputEna(0),
     PSInputAddr(0),
     NumUserSGPRs(0),
     NumSystemSGPRs(0),
+    ReturnsVoid(true),
     HasSpilledSGPRs(false),
     HasSpilledVGPRs(false),
     PrivateSegmentBuffer(false),
@@ -72,6 +74,8 @@
   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   const Function *F = MF.getFunction();
 
+  PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+
   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
 
   if (getShaderType() == ShaderType::COMPUTE)
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -45,6 +45,8 @@
 bool isReadOnlySegment(const GlobalValue *GV);
 
 unsigned getShaderType(const Function &F);
+unsigned getInitialPSInputAddr(const Function &F);
+
 bool isSI(const MCSubtargetInfo &STI);
 bool isCI(const MCSubtargetInfo &STI);
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -106,20 +106,27 @@
   return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
 }
 
-static const char ShaderTypeAttribute[] = "ShaderType";
-
-unsigned getShaderType(const Function &F) {
-  Attribute A = F.getFnAttribute(ShaderTypeAttribute);
-  unsigned ShaderType = ShaderType::COMPUTE;
+static unsigned getIntegerAttribute(const Function &F, const char *Name,
+                                    unsigned Default) {
+  Attribute A = F.getFnAttribute(Name);
+  unsigned Result = Default;
 
   if (A.isStringAttribute()) {
     StringRef Str = A.getValueAsString();
-    if (Str.getAsInteger(0, ShaderType)) {
+    if (Str.getAsInteger(0, Result)) {
       LLVMContext &Ctx = F.getContext();
       Ctx.emitError("can't parse shader type");
     }
   }
-  return ShaderType;
+  return Result;
+}
+
+unsigned getShaderType(const Function &F) {
+  return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE);
+}
+
+unsigned getInitialPSInputAddr(const Function &F) {
+  return getIntegerAttribute(F, "InitialPSInputAddr", 0);
 }
 
 bool isSI(const MCSubtargetInfo &STI) {
Index: test/CodeGen/AMDGPU/ret.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/ret.ll
@@ -0,0 +1,200 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+attributes #0 = { "ShaderType"="1" }
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+; GCN-LABEL: {{^}}vgpr:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
+; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
+; GCN: s_waitcnt expcnt(0)
+; GCN-NOT: s_endpgm
+define {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+  %x = fadd float %3, 1.0
+  %a = insertvalue {float, float} undef, float %x, 0
+  %b = insertvalue {float, float} %a, float %3, 1
+  ret {float, float} %b
+}
+
+; GCN-LABEL: {{^}}vgpr_literal:
+; GCN: v_mov_b32_e32 v4, v0
+; GCN-DAG: v_mov_b32_e32 v0, 1.0
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN-DAG: v_mov_b32_e32 v2, 4.0
+; GCN-DAG: v_mov_b32_e32 v3, -1.0
+; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
+; GCN: s_waitcnt expcnt(0)
+; GCN-NOT: s_endpgm
+define {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+  ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0}
+}
+
+
+; GCN-LABEL: {{^}}vgpr_ps_addr0:
+; GCN-NOT: v_mov_b32_e32 v0
+; GCN-NOT: v_mov_b32_e32 v1
+; GCN-NOT: v_mov_b32_e32 v2
+; GCN: v_mov_b32_e32 v3, v4
+; GCN: v_mov_b32_e32 v4, v6
+; GCN-NOT: s_endpgm
+attributes #1 = { "ShaderType"="0" "InitialPSInputAddr"="0" }
+define {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+  %i0 = extractelement <2 x i32> %4, i32 0
+  %i1 = extractelement <2 x i32> %4, i32 1
+  %i2 = extractelement <2 x i32> %7, i32 0
+  %i3 = extractelement <2 x i32> %8, i32 0
+  %f0 = bitcast i32 %i0 to float
+  %f1 = bitcast i32 %i1 to float
+  %f2 = bitcast i32 %i2 to float
+  %f3 = bitcast i32 %i3 to float
+  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+  ret {float, float, float, float, float} %r4
+}
+
+
+; GCN-LABEL: {{^}}vgpr_ps_addr1:
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3
+; GCN: v_mov_b32_e32 v2, v4
+; GCN-DAG: v_mov_b32_e32 v3, v6
+; GCN-DAG: v_mov_b32_e32 v4, v8
+; GCN-NOT: s_endpgm
+attributes #2 = { "ShaderType"="0" "InitialPSInputAddr"="1" }
+define {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
+  %i0 = extractelement <2 x i32> %4, i32 0
+  %i1 = extractelement <2 x i32> %4, i32 1
+  %i2 = extractelement <2 x i32> %7, i32 0
+  %i3 = extractelement <2 x i32> %8, i32 0
+  %f0 = bitcast i32 %i0 to float
+  %f1 = bitcast i32 %i1 to float
+  %f2 = bitcast i32 %i2 to float
+  %f3 = bitcast i32 %i3 to float
+  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+  ret {float, float, float, float, float} %r4
+}
+
+
+; GCN-LABEL: {{^}}vgpr_ps_addr119:
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3
+; GCN: v_mov_b32_e32 v2, v6
+; GCN: v_mov_b32_e32 v3, v8
+; GCN: v_mov_b32_e32 v4, v12
+; GCN-NOT: s_endpgm
+attributes #3 = { "ShaderType"="0" "InitialPSInputAddr"="119" }
+define {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
+  %i0 = extractelement <2 x i32> %4, i32 0
+  %i1 = extractelement <2 x i32> %4, i32 1
+  %i2 = extractelement <2 x i32> %7, i32 0
+  %i3 = extractelement <2 x i32> %8, i32 0
+  %f0 = bitcast i32 %i0 to float
+  %f1 = bitcast i32 %i1 to float
+  %f2 = bitcast i32 %i2 to float
+  %f3 = bitcast i32 %i3 to float
+  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+  ret {float, float, float, float, float} %r4
+}
+
+
+; GCN-LABEL: {{^}}vgpr_ps_addr418:
+; GCN-NOT: v_mov_b32_e32 v0
+; GCN-NOT: v_mov_b32_e32 v1
+; GCN-NOT: v_mov_b32_e32 v2
+; GCN: v_mov_b32_e32 v3, v4
+; GCN: v_mov_b32_e32 v4, v8
+; GCN-NOT: s_endpgm
+attributes #4 = { "ShaderType"="0" "InitialPSInputAddr"="418" }
+define {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #4 {
+  %i0 = extractelement <2 x i32> %4, i32 0
+  %i1 = extractelement <2 x i32> %4, i32 1
+  %i2 = extractelement <2 x i32> %7, i32 0
+  %i3 = extractelement <2 x i32> %8, i32 0
+  %f0 = bitcast i32 %i0 to float
+  %f1 = bitcast i32 %i1 to float
+  %f2 = bitcast i32 %i2 to float
+  %f3 = bitcast i32 %i3 to float
+  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+  ret {float, float, float, float, float} %r4
+}
+
+
+; GCN-LABEL: {{^}}sgpr:
+; GCN: s_add_i32 s0, s3, 2
+; GCN: s_mov_b32 s2, s3
+; GCN-NOT: s_endpgm
+define {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  %x = add i32 %2, 2
+  %a = insertvalue {i32, i32, i32} undef, i32 %x, 0
+  %b = insertvalue {i32, i32, i32} %a, i32 %1, 1
+  %c = insertvalue {i32, i32, i32} %a, i32 %2, 2
+  ret {i32, i32, i32} %c
+}
+
+
+; GCN-LABEL: {{^}}sgpr_literal:
+; GCN: s_mov_b32 s0, 5
+; XGCN-NOT: s_mov_b32 s0, s0
+; GCN-DAG: s_mov_b32 s1, 6
+; GCN-DAG: s_mov_b32 s2, 7
+; GCN-DAG: s_mov_b32 s3, 8
+; GCN-NOT: s_endpgm
+define {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  %x = add i32 %2, 2
+  ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8}
+}
+
+
+; GCN-LABEL: {{^}}both:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
+; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
+; GCN-DAG: s_add_i32 s0, s3, 2
+; GCN-DAG: s_mov_b32 s1, s2
+; GCN: s_mov_b32 s2, s3
+; GCN: s_waitcnt expcnt(0)
+; GCN-NOT: s_endpgm
+define {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+  %v = fadd float %3, 1.0
+  %s = add i32 %2, 2
+  %a0 = insertvalue {float, i32, float, i32, i32} undef, float %v, 0
+  %a1 = insertvalue {float, i32, float, i32, i32} %a0, i32 %s, 1
+  %a2 = insertvalue {float, i32, float, i32, i32} %a1, float %3, 2
+  %a3 = insertvalue {float, i32, float, i32, i32} %a2, i32 %1, 3
+  %a4 = insertvalue {float, i32, float, i32, i32} %a3, i32 %2, 4
+  ret {float, i32, float, i32, i32} %a4
+}
+
+
+; GCN-LABEL: {{^}}structure_literal:
+; GCN: v_mov_b32_e32 v3, v0
+; GCN-DAG: v_mov_b32_e32 v0, 1.0
+; GCN-DAG: s_mov_b32 s0, 2
+; GCN-DAG: s_mov_b32 s1, 3
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN-DAG: v_mov_b32_e32 v2, 4.0
+; GCN-DAG: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
+define {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+  ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
+}
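
Usage sketch (illustrative, not part of the patch; the function and attribute-group names below are invented, and only mirror what the tests above already exercise): a driver that links shader parts at runtime compiles a part like the one below, then appends its own epilog behind it. Because the function does not return void, RetCC_SI assigns the i32 result to SGPR0 and the two float results to VGPR0/VGPR1, SITargetLowering::LowerReturn emits an AMDGPUISD::RET_FLAG node carrying those registers, and SIInsertWaits removes the trailing S_ENDPGM since SIMachineFunctionInfo::ReturnsVoid is false.

; Hypothetical vertex-shader part ("ShaderType"="1"); the returned values
; stay live in SGPR0 and VGPR0/VGPR1 for the code appended after it.
define {i32, float, float} @vs_part(i32 inreg %desc, float %in) #5 {
  %y = fadd float %in, 2.0
  %r0 = insertvalue {i32, float, float} undef, i32 %desc, 0
  %r1 = insertvalue {i32, float, float} %r0, float %y, 1
  %r2 = insertvalue {i32, float, float} %r1, float %in, 2
  ret {i32, float, float} %r2
}

attributes #5 = { "ShaderType"="1" }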