Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -345,8 +345,8 @@
   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   uint64_t CodeSize = 0;
-  unsigned MaxSGPR = 0;
-  unsigned MaxVGPR = 0;
+  unsigned MaxSGPR = MFI->getNumUserSGPRs() - 1;
+  unsigned MaxVGPR = MFI->getNumUserVGPRs() - 1;
   bool VCCUsed = false;
   bool FlatUsed = false;
   const SIRegisterInfo *RI =
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -738,8 +738,8 @@
         ((Info->getPSInputAddr() & 0x7F) == 0 ||
          ((Info->getPSInputAddr() & 0xF) == 0 &&
           Info->isPSInputAllocated(11)))) {
-      CCInfo.AllocateReg(AMDGPU::VGPR0);
-      CCInfo.AllocateReg(AMDGPU::VGPR1);
+      CCInfo.AllocateReg(Info->addArgUserReg(*TRI, AMDGPU::VGPR0, 4));
+      CCInfo.AllocateReg(Info->addArgUserReg(*TRI, AMDGPU::VGPR1, 4));
       Info->markPSInputAllocated(0);
       Info->PSInputEna |= 1;
     }
@@ -830,50 +830,59 @@
       Info->ABIArgOffset = Offset + MemVT.getStoreSize();
       continue;
     }
+
     assert(VA.isRegLoc() && "Parameter must be in a register!");
+    // Currently only in register types are 32-bit or 64-bit. Only the first
+    // register in the pair is returned by getLocReg.
+    bool IsSGPRArg = Arg.Flags.isInReg() || Arg.Flags.isByVal();
 
     unsigned Reg = VA.getLocReg();
-
-    if (VT == MVT::i64) {
-      // For now assume it is a pointer
-      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
-                                     &AMDGPU::SReg_64RegClass);
-      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
-      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-      InVals.push_back(Copy);
-      continue;
+    unsigned RegSize = VT.getStoreSize();
+    const TargetRegisterClass *RC = IsSGPRArg ?
+      TRI->getSGPRSizeClass(RegSize) : TRI->getVGPRSizeClass(RegSize);
+    if (RegSize == 8) {
+      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
+      assert(Reg != AMDGPU::NoRegister);
     }
 
-    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+    if (!Arg.VT.isVector()) {
+      unsigned NewReg = Info->addArgUserReg(*TRI, Reg, RegSize);
 
-    Reg = MF.addLiveIn(Reg, RC);
-    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      CCInfo.AllocateReg(NewReg);
+      Reg = MF.addLiveIn(NewReg, RC);
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      InVals.push_back(Val);
+      continue;
+    }
 
-    if (Arg.VT.isVector()) {
+    // Build a vector from the registers
+    Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+    unsigned NumElements = ParamType->getVectorNumElements();
 
-      // Build a vector from the registers
-      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
-      unsigned NumElements = ParamType->getVectorNumElements();
+    SmallVector<SDValue, 4> Regs;
+    unsigned NewReg = Info->addArgUserReg(*TRI, Reg, RegSize);
 
-      SmallVector<SDValue, 4> Regs;
-      Regs.push_back(Val);
-      for (unsigned j = 1; j != NumElements; ++j) {
-        Reg = ArgLocs[ArgIdx++].getLocReg();
-        Reg = MF.addLiveIn(Reg, RC);
+    CCInfo.AllocateReg(NewReg);
+    Reg = MF.addLiveIn(NewReg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+    Regs.push_back(Val);
 
-        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-        Regs.push_back(Copy);
-      }
-
-      // Fill up the missing vector elements
-      NumElements = Arg.VT.getVectorNumElements() - NumElements;
-      Regs.append(NumElements, DAG.getUNDEF(VT));
+    for (unsigned j = 1; j != NumElements; ++j) {
+      Reg = ArgLocs[ArgIdx++].getLocReg();
+      unsigned NewReg = Info->addArgUserReg(*TRI, Reg, RegSize);
+      CCInfo.AllocateReg(NewReg);
+      Reg = MF.addLiveIn(NewReg, RC);
 
-      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
-      continue;
+      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      Regs.push_back(Copy);
     }
 
-    InVals.push_back(Val);
+    // Fill up the missing vector elements
+    NumElements = Arg.VT.getVectorNumElements() - NumElements;
+    Regs.append(NumElements, DAG.getUNDEF(VT));
+
+    InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
   }
 
   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -73,6 +73,7 @@
   unsigned ScratchOffsetReg;
   unsigned NumUserSGPRs;
   unsigned NumSystemSGPRs;
+  unsigned NumUserVGPRs;
 
 private:
   bool HasSpilledSGPRs;
@@ -112,6 +113,10 @@
     return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
   }
 
+  MCPhysReg getNextUserVGPR() const {
+    return AMDGPU::VGPR0 + NumUserVGPRs;
+  }
+
 public:
   struct SpilledReg {
     unsigned VGPR;
@@ -138,6 +143,9 @@
   unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
   unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
 
+  unsigned addArgUserReg(const SIRegisterInfo &TRI,
+                         unsigned CurReg, unsigned Size);
+
   // Add system SGPRs.
   unsigned addWorkGroupIDX() {
     WorkGroupIDXSystemSGPR = getNextSystemSGPR();
@@ -249,6 +257,10 @@
     return NumUserSGPRs + NumSystemSGPRs;
   }
 
+  unsigned getNumUserVGPRs() const {
+    return NumUserVGPRs;
+  }
+
   unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
     return PrivateSegmentWaveByteOffsetSystemSGPR;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -54,6 +54,7 @@
     PSInputEna(0),
     NumUserSGPRs(0),
     NumSystemSGPRs(0),
+    NumUserVGPRs(0),
     HasSpilledSGPRs(false),
     HasSpilledVGPRs(false),
     HasNonSpillStackObjects(false),
@@ -102,8 +103,18 @@
   // X, XY, and XYZ are the only supported combinations, so make sure Y is
   // enabled if Z is.
-  if (WorkItemIDZ)
+  if (WorkItemIDZ) {
     WorkItemIDY = true;
+  }
+
+  if (WorkItemIDX)
+    ++NumUserVGPRs;
+
+  if (WorkItemIDY)
+    ++NumUserVGPRs;
+
+  if (WorkItemIDZ)
+    ++NumUserVGPRs;
 
   bool MaySpill = ST.isVGPRSpillingEnabled(*F);
   bool HasStackObjects = FrameInfo->hasStackObjects();
@@ -174,6 +185,42 @@
   return FlatScratchInitUserSGPR;
 }
 
+unsigned SIMachineFunctionInfo::addArgUserReg(const SIRegisterInfo &TRI,
+                                              unsigned CurReg,
+                                              unsigned Size) {
+  const TargetRegisterClass *RC = TRI.getPhysRegClass(CurReg);
+
+  // VGPRs have no alignment restrictions
+  if (TRI.hasVGPRs(RC)) {
+    unsigned Reg = getNextUserVGPR();
+    NumUserVGPRs += Size / 4;
+    return Reg;
+  }
+
+  // SGPRs have alignment restrictions.
+  if (Size == 4) {
+    unsigned Reg = getNextUserSGPR();
+    NumUserSGPRs += 1;
+    return Reg;
+  }
+
+  assert(Size == 8 &&
+         "user sgpr calling convention only has 4 or 8 byte types");
+
+  unsigned FirstReg = TRI.getSubReg(CurReg, AMDGPU::sub0);
+
+  // Skip over padding register. We assume the register passed in is correctly
+  // aligned.
+  unsigned Reg = getNextUserSGPR();
+  while (Reg != FirstReg) {
+    ++Reg;
+    ++NumUserSGPRs;
+  }
+
+  NumUserSGPRs += 2;
+  return CurReg;
+}
+
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
   MachineFunction *MF,
   unsigned FrameIndex,
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -117,11 +117,18 @@
   /// \returns A VGPR reg class with the same width as \p SRC
   const TargetRegisterClass *getEquivalentVGPRClass(
-                                          const TargetRegisterClass *SRC) const;
+                                          const TargetRegisterClass *SRC) const {
+    return getVGPRSizeClass(SRC->getSize());
+  }
 
   /// \returns A SGPR reg class with the same width as \p SRC
   const TargetRegisterClass *getEquivalentSGPRClass(
-                                          const TargetRegisterClass *VRC) const;
+                                          const TargetRegisterClass *VRC) const {
+    return getSGPRSizeClass(VRC->getSize());
+  }
+
+  static const TargetRegisterClass *getVGPRSizeClass(unsigned Size);
+  static const TargetRegisterClass *getSGPRSizeClass(unsigned Size);
 
   /// \returns The register class that is used for a sub-register of \p RC for
   /// the given \p SubIdx.  If \p SubIdx equals NoSubRegister, \p RC will
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -699,9 +699,8 @@
   }
 }
 
-const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
-                                          const TargetRegisterClass *SRC) const {
-  switch (SRC->getSize()) {
+const TargetRegisterClass *SIRegisterInfo::getVGPRSizeClass(unsigned Size) {
+  switch (Size) {
   case 4:
     return &AMDGPU::VGPR_32RegClass;
   case 8:
@@ -719,9 +718,9 @@
   }
 }
 
-const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
-                                          const TargetRegisterClass *VRC) const {
-  switch (VRC->getSize()) {
+
+const TargetRegisterClass *SIRegisterInfo::getSGPRSizeClass(unsigned Size) {
+  switch (Size) {
   case 4:
     return &AMDGPU::SGPR_32RegClass;
   case 8:
Index: test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
+++ test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
@@ -1,10 +1,93 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}unused_ptr_0:
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s2
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s3
+; GCN: NumSgprs: 8
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 4
+define amdgpu_vs void @unused_ptr_0(i32 addrspace(2)* inreg %arg0, i32 addrspace(2)* inreg %arg1) #0 {
+  store volatile i32 addrspace(2)* %arg1, i32 addrspace(2)* addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}unused_ptr_1:
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s0
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s1
+; GCN: NumSgprs: 8
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 4
+define amdgpu_vs void @unused_ptr_1(i32 addrspace(2)* inreg %arg0, i32 addrspace(2)* inreg %arg1) #0 {
+  store volatile i32 addrspace(2)* %arg0, i32 addrspace(2)* addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}unused_i32_ptr_0:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s0
+; GCN: NumSgprs: 8
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 4
+define amdgpu_vs void @unused_i32_ptr_0(i32 inreg %arg0, i32 addrspace(2)* inreg %arg1) #0 {
+  store volatile i32 %arg0, i32 addrspace(1)* null
+  ret void
+}
+
+; XGCN-LABEL: {{^}}f64_input:
+; XGCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s0
+; XGCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s1
+; XGCN: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+;define amdgpu_vs void @f64_input(double inreg %arg0) #0 {
+;  store volatile double %arg0, double addrspace(1)* null
+;  ret void
+;}
 
-; GCN-LABEL: {{^}}shader_cc:
+; GCN-LABEL: {{^}}unused_ptr_i32_0:
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s0
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s1
+; GCN: NumSgprs: 8
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 3
+define amdgpu_vs void @unused_ptr_i32_0(i32 addrspace(2)* inreg %arg0, i32 inreg %arg1) #0 {
+  store volatile i32 addrspace(2)* %arg0, i32 addrspace(2)* addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}unused_i32_v4i32_0:
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s1
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s2
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s3
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s4
+; GCN: NumSgprs: 5
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 5
+define amdgpu_vs void @unused_i32_v4i32_0(i32 inreg %arg0, <4 x i32> inreg %arg1) #0 {
+  store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}shader_cc_0:
 ; GCN: v_add_i32_e32 v0, vcc, s8, v0
-define amdgpu_cs float @shader_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+; GCN: NumSgprs: 11
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 9
+define amdgpu_cs float @shader_cc_0(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) #0 {
+  %vi = bitcast float %v to i32
+  %x = add i32 %vi, %w
+  %xf = bitcast i32 %x to float
+  ret float %xf
+}
+
+; GCN-LABEL: {{^}}shader_cc_1:
+; GCN: v_add_i32_e32 v0, vcc, s6, v0
+; GCN: NumSgprs: 9
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 7
+define amdgpu_cs float @shader_cc_1(<3 x i32> inreg, <3 x i32> inreg, i32 inreg %w, float %v) #0 {
+  %vi = bitcast float %v to i32
+  %x = add i32 %vi, %w
+  %xf = bitcast i32 %x to float
+  ret float %xf
+}
+
+; GCN-LABEL: {{^}}shader_cc_2:
+; GCN: v_add_i32_e32 v0, vcc, s6, v0
+; GCN: NumSgprs: 9
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 7
+define amdgpu_cs float @shader_cc_2(<3 x i32> inreg, i64 inreg, i32 inreg %w, float %v) #0 {
   %vi = bitcast float %v to i32
   %x = add i32 %vi, %w
   %xf = bitcast i32 %x to float
@@ -13,9 +96,13 @@
 
 ; GCN-LABEL: {{^}}kernel_cc:
 ; GCN: s_endpgm
-define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+; GCN: NumSgprs: 2
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) #0 {
   %vi = bitcast float %v to i32
   %x = add i32 %vi, %w
   %xf = bitcast i32 %x to float
   ret float %xf
 }
+
+attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -88,11 +88,12 @@
 ;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
-define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) {
+define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <4 x float> %arg3, i32 %arg4) {
 main_body:
-  call void @llvm.amdgcn.image.store.i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0)
-  %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0)
-  call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %arg2 = load volatile <8 x i32>, <8 x i32> addrspace(2)* undef
+  call void @llvm.amdgcn.image.store.i32(<4 x float> %arg3, i32 %arg4, <8 x i32> %arg, i32 15, i1 false, i1 false, i1 false, i1 false)
+  %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %arg4, <8 x i32> %arg1, i32 15, i1 false, i1 false, i1 false, i1 false)
+  call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %arg4, <8 x i32> %arg2, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret void
 }
Index: test/CodeGen/AMDGPU/register-count-comments.ll
===================================================================
--- test/CodeGen/AMDGPU/register-count-comments.ll
+++ test/CodeGen/AMDGPU/register-count-comments.ll
@@ -26,3 +26,19 @@
   store i32 %x, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; SI-LABEL: {{^}}one_vgpr_used_3_enabled:
+; SI: NumVgprs: 3
+define void @one_vgpr_used_3_enabled(i32 addrspace(1)* %out) nounwind {
+  %x = call i32 @llvm.amdgcn.workitem.id.x()
+  %y = call i32 @llvm.amdgcn.workitem.id.y()
+  %z = call i32 @llvm.amdgcn.workitem.id.z()
+  store volatile i32 %x, i32 addrspace(1)* %out, align 4
+  store volatile i32 %y, i32 addrspace(1)* %out, align 4
+  store volatile i32 %z, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.z() nounwind readnone
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -11,18 +11,20 @@
 ; GCN-LABEL: {{^}}main:
+; GCN-DAG: s_mov_b32 s16, s12
 ; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-DAG: s_mov_b32 s14, -1
 ; SI-DAG: s_mov_b32 s15, 0x98f000
 ; VI-DAG: s_mov_b32 s15, 0x980000
-; s12 is offset user SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload
+; s16 is offset user SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 16-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 16-byte Folded Reload
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
+; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 12
 define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) {
 bb: