Index: include/llvm/IR/CallingConv.h
===================================================================
--- include/llvm/IR/CallingConv.h
+++ include/llvm/IR/CallingConv.h
@@ -178,6 +178,9 @@
     /// which have an "optimized" convention to preserve registers.
     AVR_BUILTIN = 86,
 
+    /// Calling convention used for Mesa shaders.
+    AMDGPU_SHADER = 87,
+
     /// The highest possible calling convention ID. Must be some 2^k - 1.
     MaxID = 1023
   };
Index: lib/AsmParser/LLLexer.cpp
===================================================================
--- lib/AsmParser/LLLexer.cpp
+++ lib/AsmParser/LLLexer.cpp
@@ -597,6 +597,7 @@
   KEYWORD(hhvmcc);
   KEYWORD(hhvm_ccc);
   KEYWORD(cxx_fast_tlscc);
+  KEYWORD(amdgpu_shader);
 
   KEYWORD(cc);
   KEYWORD(c);
Index: lib/AsmParser/LLParser.cpp
===================================================================
--- lib/AsmParser/LLParser.cpp
+++ lib/AsmParser/LLParser.cpp
@@ -1573,6 +1573,7 @@
 ///   ::= 'hhvmcc'
 ///   ::= 'hhvm_ccc'
 ///   ::= 'cxx_fast_tlscc'
+///   ::= 'amdgpu_shader'
 ///   ::= 'cc' UINT
 ///
 bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
@@ -1607,6 +1608,7 @@
   case lltok::kw_hhvmcc:         CC = CallingConv::HHVM; break;
   case lltok::kw_hhvm_ccc:       CC = CallingConv::HHVM_C; break;
   case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break;
+  case lltok::kw_amdgpu_shader:  CC = CallingConv::AMDGPU_SHADER; break;
   case lltok::kw_cc: {
     Lex.Lex();
     return ParseUInt32(CC);
Index: lib/AsmParser/LLToken.h
===================================================================
--- lib/AsmParser/LLToken.h
+++ lib/AsmParser/LLToken.h
@@ -104,6 +104,7 @@
   kw_x86_intrcc, kw_hhvmcc, kw_hhvm_ccc, kw_cxx_fast_tlscc,
+  kw_amdgpu_shader,
 
   // Attributes:
   kw_attributes,
Index: lib/IR/AsmWriter.cpp
===================================================================
--- lib/IR/AsmWriter.cpp
+++ lib/IR/AsmWriter.cpp
@@ -318,6 +318,7 @@
   case CallingConv::X86_INTR:      Out << "x86_intrcc"; break;
   case CallingConv::HHVM:          Out << "hhvmcc"; break;
   case CallingConv::HHVM_C:        Out << "hhvm_ccc"; break;
+  case CallingConv::AMDGPU_SHADER: Out << "amdgpu_shader"; break;
   }
 }
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -577,7 +577,7 @@
     OutStreamer->EmitIntValue(RsrcReg, 4);
     OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
                               S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
-    if (STM.isVGPRSpillingEnabled(MFI)) {
+    if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
       OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
       OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
     }
Index: lib/Target/AMDGPU/AMDGPUCallingConv.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -117,14 +117,12 @@
   CCIf<"static_cast<const AMDGPUSubtarget&>"
        "(State.getMachineFunction().getSubtarget()).getGeneration() >="
        "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
-       "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()"
-       "->getShaderType() == ShaderType::COMPUTE",
+       "State.getCallingConv() != CallingConv::AMDGPU_SHADER",
       CCDelegateTo<CC_AMDGPU_Kernel>>,
   CCIf<"static_cast<const AMDGPUSubtarget&>"
        "(State.getMachineFunction().getSubtarget()).getGeneration() < "
        "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
-       "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()"
-       "->getShaderType() == ShaderType::COMPUTE",
+       "State.getCallingConv() != CallingConv::AMDGPU_SHADER",
       CCDelegateTo<CC_R600_Kernel>>,
   CCIf<"static_cast<const AMDGPUSubtarget&>"
        "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
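For reference, this is how the new convention reads in IR once the lexer/parser/printer
changes above are in place. A minimal sketch, not part of the patch; the function and
argument names are illustrative:

; SGPR inputs to a shader are marked inreg (see the isArgPassedInSGPR change below);
; unmarked arguments arrive in VGPRs and are treated as divergent.
define amdgpu_shader float @ps_main(<4 x i32> inreg %rsrc, float %v) {
  ret float %v
}

; 'cc 87' is the numeric spelling of the same convention (AMDGPU_SHADER = 87);
; with the AsmWriter change it round-trips back to "amdgpu_shader".
define cc 87 void @vs_main() {
  ret void
}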
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -305,7 +305,7 @@
   bool isAmdHsaOS() const {
     return TargetTriple.getOS() == Triple::AMDHSA;
   }
 
-  bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
+  bool isVGPRSpillingEnabled(const Function &F) const;
 
   bool isXNACKEnabled() const {
     return EnableXNACK;
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -136,9 +136,8 @@
   return AMDGPU::getIsaVersion(getFeatureBits());
 }
 
-bool AMDGPUSubtarget::isVGPRSpillingEnabled(
-    const SIMachineFunctionInfo *MFI) const {
-  return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling;
+bool AMDGPUSubtarget::isVGPRSpillingEnabled(const Function &F) const {
+  return F.getCallingConv() != CallingConv::AMDGPU_SHADER ||
+         EnableVGPRSpilling;
 }
 
 void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -265,10 +265,9 @@
 static bool isArgPassedInSGPR(const Argument *A) {
   const Function *F = A->getParent();
-  unsigned ShaderType = AMDGPU::getShaderType(*F);
 
   // Arguments to compute shaders are never a source of divergence.
-  if (ShaderType == ShaderType::COMPUTE)
+  if (F->getCallingConv() != CallingConv::AMDGPU_SHADER)
     return true;
 
   // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
Index: lib/Target/AMDGPU/R600ISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/R600ISelLowering.cpp
+++ lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1759,7 +1759,7 @@
       MemVT = MemVT.getVectorElementType();
     }
 
-    if (MFI->getShaderType() != ShaderType::COMPUTE) {
+    if (CallConv == CallingConv::AMDGPU_SHADER) {
       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
       InVals.push_back(Register);
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -596,7 +596,8 @@
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
 
-  if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
+  if (Subtarget->isAmdHsaOS() && (Info->getShaderType() != ShaderType::COMPUTE ||
+                                  CallConv == CallingConv::AMDGPU_SHADER)) {
     const Function *Fn = MF.getFunction();
     DiagnosticInfoUnsupported NoGraphicsHSA(
         *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
@@ -604,8 +605,6 @@
     return SDValue();
   }
 
-  // FIXME: We currently assume all calling conventions are kernels.
-
   SmallVector<ISD::InputArg, 16> Splits;
   BitVector Skipped(Ins.size());
 
@@ -631,7 +630,8 @@
     }
 
     // Second split vertices into their elements
-    if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
+    if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_SHADER &&
+        Arg.VT.isVector()) {
       ISD::InputArg NewArg = Arg;
       NewArg.Flags.setSplit();
       NewArg.VT = Arg.VT.getVectorElementType();
@@ -647,7 +647,7 @@
         NewArg.PartOffset += NewArg.VT.getStoreSize();
       }
 
-    } else if (Info->getShaderType() != ShaderType::COMPUTE) {
+    } else if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_SHADER) {
       Splits.push_back(Arg);
     }
   }
@@ -678,7 +678,7 @@
     Info->PSInputEna |= 1;
   }
 
-  if (Info->getShaderType() == ShaderType::COMPUTE) {
+  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_SHADER) {
     getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                             Splits);
   }
@@ -922,7 +922,7 @@
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  if (Info->getShaderType() == ShaderType::COMPUTE)
+  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_SHADER)
     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                              OutVals, DL, DAG);
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -596,7 +596,7 @@
     return;
   }
 
-  if (!ST.isVGPRSpillingEnabled(MFI)) {
+  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
     LLVMContext &Ctx = MF->getFunction()->getContext();
     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                   " spill register");
@@ -682,7 +682,7 @@
     return;
   }
 
-  if (!ST.isVGPRSpillingEnabled(MFI)) {
+  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
     LLVMContext &Ctx = MF->getFunction()->getContext();
     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                   " restore register");
@@ -728,7 +728,7 @@
     return TIDReg;
 
-  if (MFI->getShaderType() == ShaderType::COMPUTE &&
+  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_SHADER &&
       WorkGroupSize > WavefrontSize) {
     unsigned TIDIGXReg
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -80,7 +80,7 @@
   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
 
-  if (getShaderType() == ShaderType::COMPUTE)
+  if (F->getCallingConv() != CallingConv::AMDGPU_SHADER)
     KernargSegmentPtr = true;
 
   if (F->hasFnAttribute("amdgpu-work-group-id-y"))
@@ -100,7 +100,7 @@
   if (WorkItemIDZ)
     WorkItemIDY = true;
 
-  bool MaySpill = ST.isVGPRSpillingEnabled(this);
+  bool MaySpill = ST.isVGPRSpillingEnabled(*F);
   bool HasStackObjects = FrameInfo->hasStackObjects();
 
   if (HasStackObjects || MaySpill)
Index: lib/Target/AMDGPU/SITypeRewriter.cpp
===================================================================
--- lib/Target/AMDGPU/SITypeRewriter.cpp
+++ lib/Target/AMDGPU/SITypeRewriter.cpp
@@ -62,7 +62,7 @@
 }
 
 bool SITypeRewriter::runOnFunction(Function &F) {
-  if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE)
+  if (F.getCallingConv() != CallingConv::AMDGPU_SHADER)
     return false;
 
   visit(F);
Index: test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
===================================================================
--- test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
+++ test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll
@@ -9,7 +9,7 @@
 ; CHECK: DIVERGENT: float %arg5
; CHECK: DIVERGENT: i32 %arg6 -define void @main([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { +define cc 87 void @main([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { ret void } Index: test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + + +; GCN-LABEL: {{^}}shader_cc: +; GCN: v_add_i32_e32 v0, vcc, s8, v0 +define amdgpu_shader float @shader_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) { + %vi = bitcast float %v to i32 + %x = add i32 %vi, %w + %xf = bitcast i32 %x to float + ret float %xf +} + +; GCN-LABEL: {{^}}kernel_cc: +; GCN: s_endpgm +define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) { + %vi = bitcast float %v to i32 + %x = add i32 %vi, %w + %xf = bitcast i32 %x to float + ret float %xf +} Index: test/CodeGen/AMDGPU/big_alu.ll =================================================================== --- test/CodeGen/AMDGPU/big_alu.ll +++ test/CodeGen/AMDGPU/big_alu.ll @@ -3,7 +3,7 @@ ; This test ensures that R600 backend can handle ifcvt properly ; and do not generate ALU clauses with more than 128 instructions. -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #1 { +define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #1 { main_body: %0 = extractelement <4 x float> %reg0, i32 0 %1 = extractelement <4 x float> %reg0, i32 1 Index: test/CodeGen/AMDGPU/bitcast.ll =================================================================== --- test/CodeGen/AMDGPU/bitcast.ll +++ test/CodeGen/AMDGPU/bitcast.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}v32i8_to_v8i32: ; SI: s_endpgm -define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { +define amdgpu_shader void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { entry: %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 %2 = bitcast <32 x i8> %1 to <8 x i32> Index: test/CodeGen/AMDGPU/cayman-loop-bug.ll =================================================================== --- test/CodeGen/AMDGPU/cayman-loop-bug.ll +++ test/CodeGen/AMDGPU/cayman-loop-bug.ll @@ -8,7 +8,7 @@ ; CHECK-NOT: ALU_PUSH_BEFORE ; CHECK: END_LOOP ; CHECK: END_LOOP -define void @main (<4 x float> inreg %reg0) #0 { +define amdgpu_shader void @main (<4 x float> inreg %reg0) #0 { entry: br label %outer_loop outer_loop: @@ -29,4 +29,4 @@ ret void } -attributes #0 = { "ShaderType"="0" } \ No newline at end of file +attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/complex-folding.ll =================================================================== --- test/CodeGen/AMDGPU/complex-folding.ll +++ test/CodeGen/AMDGPU/complex-folding.ll @@ -2,7 +2,7 @@ 
; CHECK: {{^}}main: ; CHECK-NOT: MOV -define void @main(<4 x float> inreg %reg0) #0 { +define amdgpu_shader void @main(<4 x float> inreg %reg0) #0 { entry: %0 = extractelement <4 x float> %reg0, i32 0 %1 = call float @fabs(float %0) @@ -16,4 +16,4 @@ declare float @fabs(float ) readnone declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="0" } \ No newline at end of file +attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/floor.ll =================================================================== --- test/CodeGen/AMDGPU/floor.ll +++ test/CodeGen/AMDGPU/floor.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s ; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_shader void @test(<4 x float> inreg %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = call float @floor(float %r0) %vec = insertelement <4 x float> undef, float %r1, i32 0 Index: test/CodeGen/AMDGPU/fmad.ll =================================================================== --- test/CodeGen/AMDGPU/fmad.ll +++ test/CodeGen/AMDGPU/fmad.ll @@ -2,7 +2,7 @@ ;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_shader void @test(<4 x float> inreg %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 %r2 = extractelement <4 x float> %reg0, i32 2 @@ -16,4 +16,4 @@ declare float @fabs(float ) readnone declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="0" } \ No newline at end of file +attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/fmax.ll =================================================================== --- test/CodeGen/AMDGPU/fmax.ll +++ test/CodeGen/AMDGPU/fmax.ll @@ -2,7 +2,7 @@ ;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_shader void @test(<4 x float> inreg %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 %r2 = fcmp oge float %r0, %r1 @@ -14,4 +14,4 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="0" } \ No newline at end of file +attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/fmin.ll =================================================================== --- test/CodeGen/AMDGPU/fmin.ll +++ test/CodeGen/AMDGPU/fmin.ll @@ -2,7 +2,7 @@ ;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_shader void @test(<4 x float> inreg %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 %r2 = fcmp uge float %r0, %r1 @@ -14,4 +14,4 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="0" } \ No newline at end of file +attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/input-mods.ll =================================================================== --- test/CodeGen/AMDGPU/input-mods.ll +++ test/CodeGen/AMDGPU/input-mods.ll @@ -9,7 +9,7 @@ ;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X| ;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X| -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_shader void @test(<4 x float> inreg %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 
%r1 = call float @llvm.fabs.f32(float %r0) %r2 = fsub float -0.000000e+00, %r1 Index: test/CodeGen/AMDGPU/large-alloca-graphics.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -12,7 +12,7 @@ ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; ALL: ; ScratchSize: 32772 -define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { +define amdgpu_shader void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { %large = alloca [8192 x i32], align 4 %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 store volatile i32 %x, i32* %gep @@ -33,7 +33,7 @@ ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; ALL: ; ScratchSize: 32772 -define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { +define amdgpu_shader void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { %large = alloca [8192 x i32], align 4 %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 store volatile i32 %x, i32* %gep Index: test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll @@ -5,7 +5,7 @@ ; SI-NOT: v_cmpx_le_f32 ; SI: s_mov_b64 exec, 0 -define void @kill_gs_const() #0 { +define amdgpu_shader void @kill_gs_const() #0 { main_body: %0 = icmp ule i32 0, 3 %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 @@ -21,7 +21,7 @@ ; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} ; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] -define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 { +define amdgpu_shader void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 { entry: %tmp0 = fcmp olt float %13, 0.0 call void @llvm.AMDGPU.kill(float %14) Index: test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll +++ test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll @@ -10,7 +10,7 @@ ;GCN: v_interp_p1_f32 ;GCN: v_interp_p2_f32 -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { +define amdgpu_shader void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { main_body: %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4) @@ -25,7 +25,7 @@ ; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug: ; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] -define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, 
float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 { +define amdgpu_shader void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 { main_body: %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7) %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) Index: test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll +++ test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}v1: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xd -define void @v1(i32 %a1) #0 { +define amdgpu_shader void @v1(i32 %a1) #0 { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -16,7 +16,7 @@ ; CHECK-LABEL: {{^}}v2: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xb -define void @v2(i32 %a1) #0 { +define amdgpu_shader void @v2(i32 %a1) #0 { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -29,7 +29,7 @@ ; CHECK-LABEL: {{^}}v3: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xe -define void @v3(i32 %a1) #0 { +define amdgpu_shader void @v3(i32 %a1) #0 { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -42,7 +42,7 @@ ; CHECK-LABEL: {{^}}v4: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x7 -define void @v4(i32 %a1) #0 { +define amdgpu_shader void @v4(i32 %a1) #0 { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -55,7 +55,7 @@ ; CHECK-LABEL: {{^}}v5: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xa -define void @v5(i32 %a1) #0 { +define amdgpu_shader void @v5(i32 %a1) #0 { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -67,7 +67,7 @@ ; CHECK-LABEL: {{^}}v6: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x6 -define void @v6(i32 %a1) #0 { +define amdgpu_shader void @v6(i32 %a1) #0 { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 
15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -79,7 +79,7 @@ ; CHECK-LABEL: {{^}}v7: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x9 -define void @v7(i32 %a1) #0 { +define amdgpu_shader void @v7(i32 %a1) #0 { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) Index: test/CodeGen/AMDGPU/llvm.SI.load.dword.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.load.dword.ll +++ test/CodeGen/AMDGPU/llvm.SI.load.dword.ll @@ -14,7 +14,7 @@ ; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc -define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 { +define amdgpu_shader void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 { main_body: %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1 %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 Index: test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll +++ test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll @@ -7,7 +7,7 @@ ; BOTH-NEXT: s_sendmsg Gs_done(nop) ; BOTH-NEXT: s_endpgm -define void @main(i32 inreg %a) #0 { +define amdgpu_shader void @main(i32 inreg %a) #0 { main_body: call void @llvm.SI.sendmsg(i32 3, i32 %a) ret void Index: test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll +++ test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll @@ -3,7 +3,7 @@ ;CHECK-LABEL: {{^}}test1: ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test1(i32 %a1, i32 %vaddr) #0 { +define amdgpu_shader void @test1(i32 %a1, i32 %vaddr) #0 { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, @@ -13,7 +13,7 @@ ;CHECK-LABEL: {{^}}test2: ;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test2(i32 %a1, i32 %vaddr) #0 { +define amdgpu_shader void @test2(i32 %a1, i32 %vaddr) #0 { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, @@ -23,7 +23,7 @@ ;CHECK-LABEL: {{^}}test3: ;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test3(i32 %a1, 
i32 %vaddr) #0 { +define amdgpu_shader void @test3(i32 %a1, i32 %vaddr) #0 { %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, @@ -33,7 +33,7 @@ ;CHECK-LABEL: {{^}}test4: ;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test4(i32 %vdata, i32 %vaddr) #0 { +define amdgpu_shader void @test4(i32 %vdata, i32 %vaddr) #0 { call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) Index: test/CodeGen/AMDGPU/llvm.SI.tid.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.tid.ll +++ test/CodeGen/AMDGPU/llvm.SI.tid.ll @@ -5,7 +5,7 @@ ;SI: v_mbcnt_hi_u32_b32_e32 ;VI: v_mbcnt_hi_u32_b32_e64 -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +define amdgpu_shader void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { main_body: %4 = call i32 @llvm.SI.tid() %5 = bitcast i32 %4 to float Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -16,7 +16,7 @@ ;CHECK: buffer_atomic_swap v0, s[0:3], [[SOFS]] offset:1 glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, s[0:3], 0{{$}} -define float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) #0 { +define amdgpu_shader float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) #0 { main_body: %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) @@ -48,7 +48,7 @@ ;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc -define float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 { +define amdgpu_shader float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 { main_body: %t1 = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) @@ -80,7 +80,7 @@ ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[SOFS]] offset:1 glc -define float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) #0 { +define amdgpu_shader float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) #0 { main_body: %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll @@ -6,7 +6,7 @@ ;CHECK: 
buffer_load_format_xyzw v[4:7], s[0:3], 0 glc ;CHECK: buffer_load_format_xyzw v[8:11], s[0:3], 0 slc ;CHECK: s_waitcnt -define {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { +define amdgpu_shader {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) @@ -20,7 +20,7 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_format_xyzw v[0:3], s[0:3], 0 offset:42 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 { +define amdgpu_shader <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret <4 x float> %data @@ -33,7 +33,7 @@ ;CHECK: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff ;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS2]] offset:1 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { +define amdgpu_shader <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { main_body: %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0) %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0) @@ -49,7 +49,7 @@ ;CHECK-NOT: s_mov ;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS]] offset:81 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) #0 { +define amdgpu_shader <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) #0 { main_body: %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0) %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0) @@ -60,7 +60,7 @@ ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 { +define amdgpu_shader <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) ret <4 x float> %data @@ -69,7 +69,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 { +define amdgpu_shader <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) ret <4 x float> %data @@ -78,7 +78,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:58 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { +define amdgpu_shader <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { main_body: %ofs = add i32 %1, 58 %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) @@ -88,7 +88,7 @@ ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 { +define amdgpu_shader <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 { main_body: %data = call 
<4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) ret <4 x float> %data @@ -98,7 +98,7 @@ ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 { +define amdgpu_shader <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) ret <4 x float> %data @@ -107,7 +107,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x: ;CHECK: buffer_load_format_x v0, s[0:3], 0 ;CHECK: s_waitcnt -define float @buffer_load_x(<4 x i32> inreg %rsrc) #0 { +define amdgpu_shader float @buffer_load_x(<4 x i32> inreg %rsrc) #0 { main_body: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) ret float %data @@ -116,7 +116,7 @@ ;CHECK-LABEL: {{^}}buffer_load_xy: ;CHECK: buffer_load_format_xy v[0:1], s[0:3], 0 ;CHECK: s_waitcnt -define <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) #0 { +define amdgpu_shader <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) #0 { main_body: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) ret <2 x float> %data Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -6,7 +6,7 @@ ;CHECK: buffer_load_dwordx4 v[4:7], s[0:3], 0 glc ;CHECK: buffer_load_dwordx4 v[8:11], s[0:3], 0 slc ;CHECK: s_waitcnt -define {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { +define amdgpu_shader {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) @@ -20,7 +20,7 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0 offset:42 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 { +define amdgpu_shader <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret <4 x float> %data @@ -30,7 +30,7 @@ ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff ;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], [[OFFSET]] offset:1 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { +define amdgpu_shader <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0) ret <4 x float> %data @@ -39,7 +39,7 @@ ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 { +define amdgpu_shader <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) ret <4 x float> %data @@ -48,7 +48,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 { +define amdgpu_shader <4 x float> @buffer_load_ofs(<4 x 
i32> inreg, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) ret <4 x float> %data @@ -57,7 +57,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { +define amdgpu_shader <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { main_body: %ofs = add i32 %1, 58 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) @@ -67,7 +67,7 @@ ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 { +define amdgpu_shader <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) ret <4 x float> %data @@ -77,7 +77,7 @@ ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 { +define amdgpu_shader <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) ret <4 x float> %data @@ -86,7 +86,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1: ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_shader float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { main_body: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) ret float %data @@ -95,7 +95,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x2: ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_shader <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { main_body: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) ret <2 x float> %data Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll @@ -5,7 +5,7 @@ ;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], 0 ;CHECK: buffer_store_format_xyzw v[4:7], s[0:3], 0 glc ;CHECK: buffer_store_format_xyzw v[8:11], s[0:3], 0 slc -define void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 { +define amdgpu_shader void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) @@ -15,7 +15,7 @@ ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], 0 offset:42 -define void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 { +define amdgpu_shader void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret void 
@@ -23,7 +23,7 @@ ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -define void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 { +define amdgpu_shader void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) ret void @@ -31,7 +31,7 @@ ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen -define void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 { +define amdgpu_shader void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) ret void @@ -39,7 +39,7 @@ ;CHECK-LABEL: {{^}}buffer_store_both: ;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen -define void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 { +define amdgpu_shader void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) ret void @@ -48,7 +48,7 @@ ;CHECK-LABEL: {{^}}buffer_store_both_reversed: ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen -define void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 { +define amdgpu_shader void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) ret void @@ -62,7 +62,7 @@ ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen -define void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { +define amdgpu_shader void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) @@ -72,7 +72,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen -define void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) #0 { +define amdgpu_shader void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) #0 { main_body: call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void @@ -80,7 +80,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x2: ;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen -define void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { +define amdgpu_shader void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { main_body: call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -5,7 +5,7 @@ ;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0 ;CHECK: buffer_store_dwordx4 v[4:7], s[0:3], 0 glc ;CHECK: buffer_store_dwordx4 v[8:11], s[0:3], 0 slc -define void @buffer_store(<4 x i32> inreg, 
<4 x float>, <4 x float>, <4 x float>) #0 { +define amdgpu_shader void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) @@ -15,7 +15,7 @@ ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0 offset:42 -define void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 { +define amdgpu_shader void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret void @@ -23,7 +23,7 @@ ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -define void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 { +define amdgpu_shader void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) ret void @@ -31,7 +31,7 @@ ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -define void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 { +define amdgpu_shader void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) ret void @@ -39,7 +39,7 @@ ;CHECK-LABEL: {{^}}buffer_store_both: ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen -define void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 { +define amdgpu_shader void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) ret void @@ -48,7 +48,7 @@ ;CHECK-LABEL: {{^}}buffer_store_both_reversed: ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen -define void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 { +define amdgpu_shader void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) ret void @@ -62,7 +62,7 @@ ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen -define void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { +define amdgpu_shader void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) @@ -72,7 +72,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen -define void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) #0 { +define amdgpu_shader void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) #0 { main_body: call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void @@ -80,7 +80,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x2: ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen -define void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { +define 
amdgpu_shader void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { main_body: call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -5,7 +5,7 @@ ;SI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x04,0x00,0x00] ;VI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_swap(<8 x i32> inreg, <4 x i32>, i32) #0 { +define amdgpu_shader float @image_atomic_swap(<8 x i32> inreg, <4 x i32>, i32) #0 { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.swap.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -16,7 +16,7 @@ ;SI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x02,0x00,0x00] ;VI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x02,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_swap_v2i32(<8 x i32> inreg, <2 x i32>, i32) #0 { +define amdgpu_shader float @image_atomic_swap_v2i32(<8 x i32> inreg, <2 x i32>, i32) #0 { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.swap.v2i32(i32 %2, <2 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -27,7 +27,7 @@ ;SI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x01,0x00,0x00] ;VI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x01,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_swap_i32(<8 x i32> inreg, i32, i32) #0 { +define amdgpu_shader float @image_atomic_swap_i32(<8 x i32> inreg, i32, i32) #0 { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.swap.i32(i32 %2, i32 %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -39,7 +39,7 @@ ;VI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) ;CHECK: v_mov_b32_e32 v0, v4 -define float @image_atomic_cmpswap(<8 x i32> inreg, <4 x i32>, i32, i32) #0 { +define amdgpu_shader float @image_atomic_cmpswap(<8 x i32> inreg, <4 x i32>, i32, i32) #0 { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.v4i32(i32 %2, i32 %3, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -50,7 +50,7 @@ ;SI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x44,0xf0,0x00,0x04,0x00,0x00] ;VI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_add(<8 x i32> inreg, <4 x i32>, i32) #0 { +define amdgpu_shader float @image_atomic_add(<8 x i32> inreg, <4 x i32>, i32) #0 { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.add.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -61,7 +61,7 @@ ;SI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00] ;VI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: 
[0x00,0x31,0x4c,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_sub(<8 x i32> inreg, <4 x i32>, i32) #0 { +define amdgpu_shader float @image_atomic_sub(<8 x i32> inreg, <4 x i32>, i32) #0 { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.sub.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -87,7 +87,7 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK: image_atomic_dec v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x70,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_unchanged(<8 x i32> inreg, <4 x i32>, i32) #0 { +define amdgpu_shader float @image_atomic_unchanged(<8 x i32> inreg, <4 x i32>, i32) #0 { main_body: %t0 = call i32 @llvm.amdgcn.image.atomic.smin.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %t1 = call i32 @llvm.amdgcn.image.atomic.umin.v4i32(i32 %t0, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -4,7 +4,7 @@ ;CHECK-LABEL: {{^}}image_load_v4i32: ;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm ;CHECK: s_waitcnt vmcnt(0) -define <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +define amdgpu_shader <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex @@ -13,7 +13,7 @@ ;CHECK-LABEL: {{^}}image_load_v2i32: ;CHECK: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm ;CHECK: s_waitcnt vmcnt(0) -define <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 { +define amdgpu_shader <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex @@ -22,7 +22,7 @@ ;CHECK-LABEL: {{^}}image_load_i32: ;CHECK: image_load v[0:3], v0, s[0:7] dmask:0xf unorm ;CHECK: s_waitcnt vmcnt(0) -define <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 { +define amdgpu_shader <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex @@ -31,7 +31,7 @@ ;CHECK-LABEL: {{^}}image_load_mip: ;CHECK: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm ;CHECK: s_waitcnt vmcnt(0) -define <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +define amdgpu_shader <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex @@ -40,7 +40,7 @@ ;CHECK-LABEL: {{^}}image_load_1: ;CHECK: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm ;CHECK: s_waitcnt vmcnt(0) -define float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +define amdgpu_shader float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) %elt = extractelement <4 x float> %tex, i32 0 @@ -50,7 +50,7 @@ ;CHECK-LABEL: {{^}}image_store_v4i32: ;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf 
unorm -define void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { +define amdgpu_shader void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { main_body: call void @llvm.amdgcn.image.store.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void @@ -58,7 +58,7 @@ ;CHECK-LABEL: {{^}}image_store_v2i32: ;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm -define void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 { +define amdgpu_shader void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 { main_body: call void @llvm.amdgcn.image.store.v2i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void @@ -66,7 +66,7 @@ ;CHECK-LABEL: {{^}}image_store_i32: ;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm -define void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 { +define amdgpu_shader void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 { main_body: call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void @@ -74,7 +74,7 @@ ;CHECK-LABEL: {{^}}image_store_mip: ;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm -define void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { +define amdgpu_shader void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { main_body: call void @llvm.amdgcn.image.store.mip.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void @@ -88,7 +88,7 @@ ;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm ;CHECK: s_waitcnt vmcnt(0) ;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm -define void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) #0 { +define amdgpu_shader void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) #0 { main_body: call void @llvm.amdgcn.image.store.i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0) %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0) Index: test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll @@ -6,7 +6,7 @@ ;GCN: s_mov_b32 m0, s{{[0-9]+}} ;GCN: v_interp_p1_f32 ;GCN: v_interp_p2_f32 -define void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { +define amdgpu_shader void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { main_body: %i = extractelement <2 x i32> %4, i32 0 %j = extractelement <2 x i32> %4, i32 1 Index: test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll @@ -6,7 +6,7 @@ ;SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]] ;VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]] -define void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +define 
Index: test/CodeGen/AMDGPU/llvm.pow.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.pow.ll
+++ test/CodeGen/AMDGPU/llvm.pow.ll
@@ -5,7 +5,7 @@
 ;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
 ;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
-define void @test1(<4 x float> inreg %reg0) #0 {
+define amdgpu_shader void @test1(<4 x float> inreg %reg0) #0 {
   %r0 = extractelement <4 x float> %reg0, i32 0
   %r1 = extractelement <4 x float> %reg0, i32 1
   %r2 = call float @llvm.pow.f32( float %r0, float %r1)
@@ -27,7 +27,7 @@
 ;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
 ;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
 ;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
-define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_shader void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
   %vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1)
   call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
   ret void
Index: test/CodeGen/AMDGPU/load-input-fold.ll
===================================================================
--- test/CodeGen/AMDGPU/load-input-fold.ll
+++ test/CodeGen/AMDGPU/load-input-fold.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
Index: test/CodeGen/AMDGPU/m0-spill.ll
===================================================================
--- test/CodeGen/AMDGPU/m0-spill.ll
+++ test/CodeGen/AMDGPU/m0-spill.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: {{^}}main:
 ; CHECK-NOT: v_readlane_b32 m0
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
+define amdgpu_shader void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
 main_body:
   %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
   %cmp = fcmp ueq float 0.0, %4
Index: test/CodeGen/AMDGPU/max-literals.ll
===================================================================
--- test/CodeGen/AMDGPU/max-literals.ll
+++ test/CodeGen/AMDGPU/max-literals.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: ADD *
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
@@ -32,7 +32,7 @@
 ; CHECK-LABEL: {{^}}main2:
 ; CHECK-NOT: ADD *
-define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+define amdgpu_shader void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
Index: test/CodeGen/AMDGPU/mubuf.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf.ll
+++ test/CodeGen/AMDGPU/mubuf.ll
@@ -55,7 +55,7 @@
 ; CHECK-LABEL: {{^}}soffset_max_imm:
 ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc
-define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
+define amdgpu_shader void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
 main_body:
   %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
   %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0
@@ -74,7 +74,7 @@
 ; CHECK-LABEL: {{^}}soffset_no_fold:
 ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41
 ; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc
-define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
+define amdgpu_shader void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
 main_body:
   %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
   %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0
Index: test/CodeGen/AMDGPU/predicate-dp4.ll
===================================================================
--- test/CodeGen/AMDGPU/predicate-dp4.ll
+++ test/CodeGen/AMDGPU/predicate-dp4.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: PRED_SETE_INT * Pred,
 ; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one
-define void @main(<4 x float> inreg) #0 {
+define amdgpu_shader void @main(<4 x float> inreg) #0 {
 main_body:
   %1 = extractelement <4 x float> %0, i32 0
   %2 = bitcast float %1 to i32
Index: test/CodeGen/AMDGPU/pv-packing.ll
===================================================================
--- test/CodeGen/AMDGPU/pv-packing.ll
+++ test/CodeGen/AMDGPU/pv-packing.ll
@@ -3,7 +3,7 @@
 ;CHECK: DOT4 T{{[0-9]\.X}}
 ;CHECK: MULADD_IEEE * T{{[0-9]\.W}}
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
Index: test/CodeGen/AMDGPU/pv.ll
===================================================================
--- test/CodeGen/AMDGPU/pv.ll
+++ test/CodeGen/AMDGPU/pv.ll
@@ -3,7 +3,7 @@
 ; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
 ; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
Index: test/CodeGen/AMDGPU/r600-encoding.ll
===================================================================
--- test/CodeGen/AMDGPU/r600-encoding.ll
+++ test/CodeGen/AMDGPU/r600-encoding.ll
@@ -10,7 +10,7 @@
 ; R600: {{^}}test:
 ; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_shader void @test(<4 x float> inreg %reg0) #0 {
 entry:
   %r0 = extractelement <4 x float> %reg0, i32 0
   %r1 = extractelement <4 x float> %reg0, i32 1
Index: test/CodeGen/AMDGPU/r600-export-fix.ll
===================================================================
--- test/CodeGen/AMDGPU/r600-export-fix.ll
+++ test/CodeGen/AMDGPU/r600-export-fix.ll
@@ -10,7 +10,7 @@
 ;CHECK: EXPORT T{{[0-9]}}.0000
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
Index: test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
===================================================================
--- test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
+++ test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman
-define void @main(<4 x float> inreg, <4 x float> inreg) #0 {
+define amdgpu_shader void @main(<4 x float> inreg, <4 x float> inreg) #0 {
 main_body:
   %2 = extractelement <4 x float> %0, i32 0
   %3 = extractelement <4 x float> %0, i32 1
Index: test/CodeGen/AMDGPU/r600cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/r600cfg.ll
+++ test/CodeGen/AMDGPU/r600cfg.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
Index: test/CodeGen/AMDGPU/reciprocal.ll
===================================================================
--- test/CodeGen/AMDGPU/reciprocal.ll
+++ test/CodeGen/AMDGPU/reciprocal.ll
@@ -2,7 +2,7 @@
 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_shader void @test(<4 x float> inreg %reg0) #0 {
   %r0 = extractelement <4 x float> %reg0, i32 0
   %r1 = fdiv float 1.0, %r0
   %vec = insertelement <4 x float> undef, float %r1, i32 0
Index: test/CodeGen/AMDGPU/ret.ll
===================================================================
--- test/CodeGen/AMDGPU/ret.ll
+++ test/CodeGen/AMDGPU/ret.ll
@@ -11,7 +11,7 @@
 ; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_shader {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
   %x = fadd float %3, 1.0
   %a = insertvalue {float, float} undef, float %x, 0
@@ -28,7 +28,7 @@
 ; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_shader {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
   ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0}
 }
@@ -46,7 +46,7 @@
 ; GCN: v_mov_b32_e32 v4, v6
 ; GCN-NOT: s_endpgm
 attributes #1 = { "ShaderType"="0" "InitialPSInputAddr"="0" }
-define {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+define amdgpu_shader {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
   %i0 = extractelement <2 x i32> %4, i32 0
   %i1 = extractelement <2 x i32> %4, i32 1
   %i2 = extractelement <2 x i32> %7, i32 0
@@ -71,7 +71,7 @@
 ; GCN-LABEL: {{^}}ps_input_ena_no_inputs:
 ; GCN: v_mov_b32_e32 v0, 1.0
 ; GCN-NOT: s_endpgm
-define float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+define amdgpu_shader float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
   ret float 1.0
 }
@@ -85,7 +85,7 @@
 ; GCN-DAG: v_mov_b32_e32 v1, v2
 ; GCN: v_mov_b32_e32 v2, v3
 ; GCN-NOT: s_endpgm
-define {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+define amdgpu_shader {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
   %f = bitcast <2 x i32> %8 to <2 x float>
   %s = insertvalue {float, <2 x float>} undef, float %14, 0
   %s1 = insertvalue {float, <2 x float>} %s, <2 x float> %f, 1
@@ -105,7 +105,7 @@
 ; GCN-DAG: v_mov_b32_e32 v4, v8
 ; GCN-NOT: s_endpgm
 attributes #2 = { "ShaderType"="0" "InitialPSInputAddr"="1" }
-define {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
+define amdgpu_shader {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
   %i0 = extractelement <2 x i32> %4, i32 0
   %i1 = extractelement <2 x i32> %4, i32 1
   %i2 = extractelement <2 x i32> %7, i32 0
@@ -135,7 +135,7 @@
 ; GCN: v_mov_b32_e32 v4, v12
 ; GCN-NOT: s_endpgm
 attributes #3 = { "ShaderType"="0" "InitialPSInputAddr"="119" }
-define {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
+define amdgpu_shader {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
   %i0 = extractelement <2 x i32> %4, i32 0
   %i1 = extractelement <2 x i32> %4, i32 1
   %i2 = extractelement <2 x i32> %7, i32 0
@@ -165,7 +165,7 @@
 ; GCN: v_mov_b32_e32 v4, v8
 ; GCN-NOT: s_endpgm
 attributes #4 = { "ShaderType"="0" "InitialPSInputAddr"="418" }
-define {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #4 {
+define amdgpu_shader {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #4 {
   %i0 = extractelement <2 x i32> %4, i32 0
   %i1 = extractelement <2 x i32> %4, i32 1
   %i2 = extractelement <2 x i32> %7, i32 0
@@ -187,7 +187,7 @@
 ; GCN: s_add_i32 s0, s3, 2
 ; GCN: s_mov_b32 s2, s3
 ; GCN-NOT: s_endpgm
-define {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_shader {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
   %x = add i32 %2, 2
   %a = insertvalue {i32, i32, i32} undef, i32 %x, 0
   %b = insertvalue {i32, i32, i32} %a, i32 %1, 1
@@ -203,7 +203,7 @@
 ; GCN-DAG: s_mov_b32 s2, 7
 ; GCN-DAG: s_mov_b32 s3, 8
 ; GCN-NOT: s_endpgm
-define {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_shader {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
   %x = add i32 %2, 2
   ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8}
 }
@@ -218,7 +218,7 @@
 ; GCN: s_mov_b32 s2, s3
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_shader {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
   %v = fadd float %3, 1.0
   %s = add i32 %2, 2
@@ -239,7 +239,7 @@
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
 ; GCN-DAG: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
-define {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+define amdgpu_shader {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
   ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
 }
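Note: ret.ll is the functional heart of this change: with a non-C calling convention, non-void shader returns become expressible. The tests above pin down the lowering that AMDGPU_SHADER enables, with integer results coming back in SGPRs (s0, s1, ...), floating-point results in VGPRs (v0, v1, ...), and aggregates flattened across both. A reduced sketch of the same shape as @both above (function name hypothetical, not part of the test):

    ; assuming the lowering the checks above describe, the float lands
    ; in v0 and the i32 in s0
    define amdgpu_shader {float, i32} @ret_pair(i32 inreg %x, float %y) {
      %s = add i32 %x, 2
      %v = fadd float %y, 1.0
      %a = insertvalue {float, i32} undef, float %v, 0
      %b = insertvalue {float, i32} %a, i32 %s, 1
      ret {float, i32} %b
    }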
Index: test/CodeGen/AMDGPU/ret_jump.ll
===================================================================
--- test/CodeGen/AMDGPU/ret_jump.ll
+++ test/CodeGen/AMDGPU/ret_jump.ll
@@ -12,7 +12,7 @@
 ; ModuleID = 'bugpoint-reduced-simplified.bc'
 target triple = "amdgcn--"
-define <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
+define amdgpu_shader <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
 main_body:
   %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
   %p87 = fmul float undef, %p83
Index: test/CodeGen/AMDGPU/rv7x0_count3.ll
===================================================================
--- test/CodeGen/AMDGPU/rv7x0_count3.ll
+++ test/CodeGen/AMDGPU/rv7x0_count3.ll
@@ -2,7 +2,7 @@
 ; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
-define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_shader void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
   %1 = extractelement <4 x float> %reg1, i32 0
   %2 = extractelement <4 x float> %reg1, i32 1
   %3 = extractelement <4 x float> %reg1, i32 2
Index: test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
===================================================================
--- test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
+++ test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
 ;REQUIRES: asserts
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
Index: test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
===================================================================
--- test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
+++ test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
@@ -7,7 +7,7 @@
 ; SI-LABEL: {{^}}main(
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 2
Index: test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
+++ test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
 ;REQUIRES: asserts
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
Index: test/CodeGen/AMDGPU/sgpr-copy.ll
===================================================================
--- test/CodeGen/AMDGPU/sgpr-copy.ll
+++ test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -11,7 +11,7 @@
 ; CHECK-LABEL: {{^}}phi1:
 ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
-define void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_shader void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
   %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
   %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -35,7 +35,7 @@
 ; Make sure this program doesn't crash
 ; CHECK-LABEL: {{^}}phi2:
-define void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_shader void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
   %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
   %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -156,7 +156,7 @@
 ; We just want to make sure the program doesn't crash
 ; CHECK-LABEL: {{^}}loop:
-define void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_shader void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
   %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
   %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -227,7 +227,7 @@
 ; CHECK: image_sample
 ; CHECK: exp
 ; CHECK: s_endpgm
-define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
+define amdgpu_shader void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 entry:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
   %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -291,7 +291,7 @@
 ; This test is just checking that we don't crash / assertion fail.
 ; CHECK-LABEL: {{^}}copy2:
 ; CHECK: s_endpgm
-define void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
+define amdgpu_shader void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 entry:
   br label %LOOP68
@@ -321,7 +321,7 @@
 ; CHECK: image_sample
 ; CHECK: image_sample
 ; CHECK: s_endpgm
-define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+define amdgpu_shader void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 bb:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
   %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !2
@@ -365,7 +365,7 @@
 ; Check that the resource descriptor is stored in an sgpr.
 ; CHECK-LABEL: {{^}}mimg_srsrc_sgpr:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+define amdgpu_shader void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
   %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
@@ -380,7 +380,7 @@
 ; Check that the sampler is stored in an sgpr.
 ; CHECK-LABEL: {{^}}mimg_ssamp_sgpr:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 {
+define amdgpu_shader void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 {
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
   %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0
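Note: the two mimg tests above guard a correctness property rather than a crash: MIMG resource and sampler operands must end up in SGPRs even when the descriptor is indexed by a divergent, per-lane value (here an mbcnt result). A reduced sketch of the shape being tested (function name hypothetical; sampler operand left undef for brevity):

    declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
    declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32)

    define amdgpu_shader float @sample_divergent_rsrc([34 x <8 x i32>] addrspace(2)* byval %descs, i32 %coord) {
      %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
      %p = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %descs, i32 0, i32 %tid
      %rsrc = load <8 x i32>, <8 x i32> addrspace(2)* %p, align 32
      %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
      %r = extractelement <4 x float> %t, i32 0
      ret float %r
    }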
Index: test/CodeGen/AMDGPU/shared-op-cycle.ll
===================================================================
--- test/CodeGen/AMDGPU/shared-op-cycle.ll
+++ test/CodeGen/AMDGPU/shared-op-cycle.ll
@@ -4,7 +4,7 @@
 ; CHECK: MULADD_IEEE *
 ; CHECK-NOT: MULADD_IEEE *
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
   %w0 = extractelement <4 x float> %reg0, i32 3
   %w1 = extractelement <4 x float> %reg1, i32 3
   %w2 = extractelement <4 x float> %reg2, i32 3
@@ -29,4 +29,4 @@
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone }
\ No newline at end of file
+attributes #1 = { readnone }
Index: test/CodeGen/AMDGPU/si-lod-bias.ll
===================================================================
--- test/CodeGen/AMDGPU/si-lod-bias.ll
+++ test/CodeGen/AMDGPU/si-lod-bias.ll
@@ -6,7 +6,7 @@
 ; CHECK: {{^}}main:
 ; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf
-define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_shader void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
   %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
   %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
Index: test/CodeGen/AMDGPU/si-scheduler.ll
===================================================================
--- test/CodeGen/AMDGPU/si-scheduler.ll
+++ test/CodeGen/AMDGPU/si-scheduler.ll
@@ -11,7 +11,7 @@
 ; CHECK: s_waitcnt vmcnt(0)
 ; CHECK: exp
 ; CHECK: s_endpgm
-define void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+define amdgpu_shader void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 main_body:
   %tmp = bitcast [34 x <8 x i32>] addrspace(2)* %arg3 to <32 x i8> addrspace(2)*
   %tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0
Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll
===================================================================
--- test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -22,7 +22,7 @@
 ; Writing to M0 from an SMRD instruction will hang the GPU.
 ; CHECK-NOT: s_buffer_load_dword m0
 ; CHECK: s_endpgm
-define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
+define amdgpu_shader void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 main_body:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
   %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -668,7 +668,7 @@
 ; CHECK-LABEL: {{^}}main1:
 ; CHECK: s_endpgm
-define void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
+define amdgpu_shader void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 main_body:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
   %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
Index: test/CodeGen/AMDGPU/si-spill-cf.ll
===================================================================
--- test/CodeGen/AMDGPU/si-spill-cf.ll
+++ test/CodeGen/AMDGPU/si-spill-cf.ll
@@ -6,7 +6,7 @@
 ; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]]
 ; SI-NOT: v_readlane_b32 [[SAVED]]
-define void @main() #1 {
+define amdgpu_shader void @main() #1 {
 main_body:
   %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
   %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -88,7 +88,7 @@
 ; GCN-LABEL: {{^}}smrd_load_const0:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_shader void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -102,7 +102,7 @@
 ; GCN-LABEL: {{^}}smrd_load_const1:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_shader void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -118,7 +118,7 @@
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
-define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_shader void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -133,7 +133,7 @@
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_shader void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
@@ -148,7 +148,7 @@
 ; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define amdgpu_shader void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
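Note: the smrd_load_const0..4 tests above step through the S_BUFFER_LOAD immediate-offset limits: SI/CI encode the offset in dwords (so byte offset 16 prints as 0x4), VI encodes it in bytes (0x10), and offsets beyond the encodable range must first be materialized in a register. The driver behind all of them is llvm.SI.load.const; a reduced sketch (function name hypothetical):

    declare float @llvm.SI.load.const(<16 x i8>, i32) #1

    ; byte offset 16 fits the immediate field on every target
    define amdgpu_shader float @load_const_16(<16 x i8> addrspace(2)* inreg %desc) {
      %rsrc = load <16 x i8>, <16 x i8> addrspace(2)* %desc
      %v = call float @llvm.SI.load.const(<16 x i8> %rsrc, i32 16)
      ret float %v
    }

    attributes #1 = { nounwind readnone }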
Index: test/CodeGen/AMDGPU/split-smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/split-smrd.ll
+++ test/CodeGen/AMDGPU/split-smrd.ll
@@ -6,7 +6,7 @@
 ; CHECK-LABEL: {{^}}split_smrd_add_worklist:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+define amdgpu_shader void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
 bb:
   %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
   %tmp1 = bitcast float %tmp to i32
Index: test/CodeGen/AMDGPU/swizzle-export.ll
===================================================================
--- test/CodeGen/AMDGPU/swizzle-export.ll
+++ test/CodeGen/AMDGPU/swizzle-export.ll
@@ -6,7 +6,7 @@
 ;EG: EXPORT T{{[0-9]+}}.XXWX
 ;EG: EXPORT T{{[0-9]+}}.XXXW
-define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_shader void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
@@ -96,7 +96,7 @@
 ; EG: T{{[0-9]+}}.XY__
 ; EG: T{{[0-9]+}}.ZXY0
-define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+define amdgpu_shader void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
   %1 = extractelement <4 x float> %reg1, i32 1
Index: test/CodeGen/AMDGPU/tex-clause-antidep.ll
===================================================================
--- test/CodeGen/AMDGPU/tex-clause-antidep.ll
+++ test/CodeGen/AMDGPU/tex-clause-antidep.ll
@@ -3,7 +3,7 @@
 ;CHECK: TEX
 ;CHECK-NEXT: ALU
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_shader void @test(<4 x float> inreg %reg0) #0 {
   %1 = extractelement <4 x float> %reg0, i32 0
   %2 = extractelement <4 x float> %reg0, i32 1
   %3 = extractelement <4 x float> %reg0, i32 2
Index: test/CodeGen/AMDGPU/texture-input-merge.ll
===================================================================
--- test/CodeGen/AMDGPU/texture-input-merge.ll
+++ test/CodeGen/AMDGPU/texture-input-merge.ll
@@ -2,7 +2,7 @@
 ;CHECK-NOT: MOV
-define void @test(<4 x float> inreg %reg0) #0 {
+define amdgpu_shader void @test(<4 x float> inreg %reg0) #0 {
   %1 = extractelement <4 x float> %reg0, i32 0
   %2 = extractelement <4 x float> %reg0, i32 1
   %3 = extractelement <4 x float> %reg0, i32 2
Index: test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -5,7 +5,7 @@
 ;CHECK-LABEL: {{^}}test1:
 ;CHECK: s_cbranch_execz
 ;CHECK: %loop_body
-define void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) #0 {
+define amdgpu_shader void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) #0 {
 main_body:
   %cc = icmp eq i32 %p, 0
   br i1 %cc, label %out, label %loop_body
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -24,7 +24,7 @@
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
-define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
+define amdgpu_shader void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
 bb:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i64 0, i64 0
   %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0
Index: test/CodeGen/AMDGPU/wait.ll
===================================================================
--- test/CodeGen/AMDGPU/wait.ll
+++ test/CodeGen/AMDGPU/wait.ll
@@ -11,7 +11,7 @@
 ; DEFAULT: exp
 ; DEFAULT: s_waitcnt lgkmcnt(0)
 ; DEFAULT: s_endpgm
-define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
+define amdgpu_shader void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
 main_body:
   %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0
   %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -45,7 +45,7 @@
 ; ILPMAX: s_waitcnt vmcnt(0)
 ; ILPMAX: s_endpgm
-define void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)*
+define amdgpu_shader void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)*
 byval, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
 main_body:
   %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
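Note: wait.ll exercises s_waitcnt placement rather than the calling convention itself: a value read back from memory must be covered by a wait before the exp that consumes it, and outstanding scalar loads must be covered by lgkmcnt before s_endpgm. The shape being compiled reduces to a load feeding llvm.SI.export; a minimal sketch (function name hypothetical):

    declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

    ; the compiler must insert an s_waitcnt between the load and the exp
    define amdgpu_shader void @export_loaded(float addrspace(2)* inreg %p) {
      %v = load float, float addrspace(2)* %p
      call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %v, float %v, float %v, float %v)
      ret void
    }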
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -5,7 +5,7 @@
 ;
 ;CHECK-LABEL: {{^}}test1:
 ;CHECK-NOT: s_wqm
-define <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
+define amdgpu_shader <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
   call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
@@ -20,7 +20,7 @@
 ;CHECK: image_sample
 ;CHECK-NOT: exec
 ;CHECK: _load_dword v0,
-define float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 {
+define amdgpu_shader float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 {
 main_body:
   %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
@@ -40,7 +40,7 @@
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK: store
 ;CHECK-NOT: exec
-define <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 {
+define amdgpu_shader <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %tex.1 = bitcast <4 x float> %tex to <4 x i32>
@@ -62,7 +62,7 @@
 ;CHECK: store
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
-define <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) #0 {
+define amdgpu_shader <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) #0 {
 main_body:
   %c.1 = mul i32 %c, %d
   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
@@ -88,7 +88,7 @@
 ;CHECK: s_mov_b64 exec, [[SAVED]]
 ;CHECK: %IF
 ;CHECK: image_sample
-define float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 {
+define amdgpu_shader float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 {
 main_body:
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %IF, label %ELSE
@@ -124,7 +124,7 @@
 ;CHECK-NEXT: %ELSE
 ;CHECK: store
 ;CHECK: %END
-define float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 {
+define amdgpu_shader float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 {
 main_body:
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %ELSE, label %IF
@@ -158,7 +158,7 @@
 ;CHECK: store
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: v_cmp
-define <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 {
+define amdgpu_shader <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 {
 main_body:
   %idx.1 = extractelement <3 x i32> %idx, i32 0
   %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
@@ -205,7 +205,7 @@
 ;CHECK: load
 ;CHECK: store
 ;CHECK: v_cmp
-define float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 {
+define amdgpu_shader float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %tex.1 = extractelement <4 x float> %tex, i32 0
@@ -253,7 +253,7 @@
 ;CHECK: s_mov_b64 exec, [[SAVE]]
 ;CHECK: %END
 ;CHECK: image_sample
-define <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) #0 {
+define amdgpu_shader <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) #0 {
 main_body:
   %cond = icmp eq i32 %y, 0
   br i1 %cond, label %IF, label %END
@@ -286,7 +286,7 @@
 ;VI: flat_store_dword
 ;CHECK: s_mov_b64 exec, [[SAVE]]
 ;CHECK: image_sample
-define <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) #0 {
+define amdgpu_shader <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) #0 {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
@@ -320,7 +320,7 @@
 ;VI: flat_store_dword
 ;CHECK-NOT: wqm
 ;CHECK: v_cmpx_
-define <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) #0 {
+define amdgpu_shader <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) #0 {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)