diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -44,6 +44,8 @@
   //===--- Dwarf Emission Directives -----------------------------------===//
   SupportsDebugInformation = true;
   DwarfRegNumForCFI = true;
+  if (TT.getArch() == Triple::amdgcn)
+    ExceptionsType = ExceptionHandling::DwarfCFI;
 }
 
 bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -254,7 +254,7 @@
 ; GCN: buffer_store_dword [[BB4_K]]
 ; GCN-NEXT: s_endpgm
 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
-define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) #0 {
 bb0:
   %tmp = icmp ne i32 %arg1, 0
   br i1 %tmp, label %bb2, label %bb3
@@ -297,7 +297,7 @@
 ; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0{{$}}
 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
-; GCN-NEXT .Lfunc_end{{[0-9]+}}:
+; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
-define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) #0 {
 entry:
   br label %loop
 
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -65,7 +65,7 @@
 define amdgpu_kernel void @fadd(
     float addrspace(1)* %r,
     float addrspace(1)* %a,
-    float addrspace(1)* %b) {
+    float addrspace(1)* %b) #0 {
 entry:
   %a.val = load float, float addrspace(1)* %a
   %b.val = load float, float addrspace(1)* %b
@@ -77,7 +77,7 @@
 define amdgpu_kernel void @fsub(
     float addrspace(1)* %r,
     float addrspace(1)* %a,
-    float addrspace(1)* %b) {
+    float addrspace(1)* %b) #0 {
 entry:
   %a.val = load float, float addrspace(1)* %a
   %b.val = load float, float addrspace(1)* %b
@@ -85,3 +85,5 @@
   store float %r.val, float addrspace(1)* %r
   ret void
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
@@ -57,5 +57,5 @@
 ; Function Attrs: convergent nounwind readnone
 declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4
 
-attributes #0 = { "no-signed-zeros-fp-math"="true" }
+attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" }
 attributes #4 = { convergent nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -6,7 +6,7 @@
 ;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
 ;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
 ;CHECK: s_waitcnt
-define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
   %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
@@ -20,7 +20,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
 ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0)
   ret <4 x float> %data
@@ -31,7 +31,7 @@
 ;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
 ;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0)
   ret <4 x float> %data
@@ -40,7 +40,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_idx:
 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
   ret <4 x float> %data
@@ -49,7 +49,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_ofs:
 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
   ret <4 x float> %data
@@ -58,7 +58,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 {
 main_body:
   %ofs = add i32 %1, 60
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
@@ -68,7 +68,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_both:
 ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
   ret <4 x float> %data
@@ -78,7 +78,7 @@
 ;CHECK: v_mov_b32_e32 v2, v0
 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
   ret <4 x float> %data
@@ -87,7 +87,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_x1:
 ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
 ;CHECK: s_waitcnt
-define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
 main_body:
   %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
   ret float %data
@@ -96,7 +96,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_x2:
 ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
 ;CHECK: s_waitcnt
-define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
 main_body:
   %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
   ret <2 x float> %data
@@ -105,7 +105,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_negative_offset:
 ;CHECK: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0
 ;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen
-define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
+define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) #0 {
 main_body:
   %ofs.1 = add i32 %ofs, -16
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0)
@@ -117,7 +117,7 @@
 ; CHECK-LABEL: buffer_load_mmo:
 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
+define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) #0 {
 entry:
   store float 0.0, float addrspace(3)* %lds
   %val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
@@ -131,7 +131,7 @@
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
+define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) #0 {
 main_body:
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
@@ -156,7 +156,7 @@
 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
+define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) #0 {
 main_body:
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
@@ -179,7 +179,7 @@
 ;CHECK-NEXT: %bb.
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
+define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) #0 {
 main_body:
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
@@ -197,7 +197,7 @@
 ;CHECK-NEXT: %bb.
 ;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
+define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) #0 {
 main_body:
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
@@ -214,7 +214,7 @@
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
+define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
   %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
@@ -231,7 +231,7 @@
 ;CHECK-NEXT: %bb.
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) {
+define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
   %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
@@ -247,7 +247,7 @@
 ;CHECK-NEXT: %bb.
 ;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) {
+define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
   %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
@@ -263,7 +263,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %val = uitofp i8 %tmp to float
@@ -276,7 +276,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
   %tmp2 = zext i16 %tmp to i32
@@ -290,7 +290,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %tmp2 = sext i8 %tmp to i32
@@ -304,7 +304,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
   %tmp2 = sext i16 %tmp to i32
@@ -317,7 +317,7 @@
 ;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %tmp2 = zext i8 %tmp to i32
@@ -330,7 +330,7 @@
 ;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %tmp2 = zext i16 %tmp to i32
@@ -343,7 +343,7 @@
 ;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %tmp2 = sext i8 %tmp to i32
@@ -356,7 +356,7 @@
 ;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %tmp2 = sext i16 %tmp to i32
@@ -370,7 +370,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %tmp2 = zext i8 %tmp to i32
@@ -385,7 +385,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %tmp2 = zext i16 %tmp to i32
@@ -400,7 +400,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %tmp2 = sext i8 %tmp to i32
@@ -415,7 +415,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %tmp2 = sext i16 %tmp to i32
@@ -430,7 +430,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_bfe_i32 v{{[0-9]}}, v{{[0-9]}}, 0, 5
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte_type_check(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @buffer_load_sbyte_type_check(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   %tmp2 = zext i8 %tmp to i32
@@ -446,7 +446,7 @@
 ; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
 ; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
-define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) #0 {
   %alloca = alloca i32, addrspace(5)
   %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
 
@@ -458,7 +458,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}}
 ; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
 ; CHECK: buffer_load_dword v0, v{{\[}}[[FI]]:[[HI]]
-define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
+define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) #0 {
   %alloca = alloca i32, addrspace(5)
   %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -6,7 +6,7 @@
 ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
 ;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
-define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 {
 main_body:
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
@@ -17,7 +17,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_immoffs:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
-define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 {
 main_body:
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
   ret void
@@ -26,7 +26,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_idx:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 {
 main_body:
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
   ret void
@@ -35,7 +35,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_ofs:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
-define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 {
 main_body:
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
   ret void
@@ -44,7 +44,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_both:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 {
 main_body:
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
   ret void
@@ -54,7 +54,7 @@
 ;CHECK: v_mov_b32_e32 v6, v4
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
+define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 {
 main_body:
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
   ret void
@@ -69,7 +69,7 @@
 ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 {
 main_body:
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
@@ -80,7 +80,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x1:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) #0 {
 main_body:
   call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
   ret void
@@ -99,7 +99,7 @@
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
-define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 {
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   %a3 = add i32 %a, 12
@@ -120,7 +120,7 @@
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
-define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 {
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   %a3 = add i32 %a, 12
@@ -139,7 +139,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) {
+define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) #0 {
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
   call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
@@ -150,7 +150,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
-define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) {
+define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) #0 {
   %a1 = add i32 %a, 28
   %a2 = add i32 %a, 32
   %a3 = add i32 %a, 36
@@ -163,7 +163,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) {
+define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) #0 {
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
   call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
@@ -174,7 +174,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) {
+define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) #0 {
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
@@ -186,7 +186,7 @@
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
-define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 {
   call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
   call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
@@ -199,7 +199,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) {
+define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) #0 {
   call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
   call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
   ret void
@@ -208,7 +208,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged:
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) {
+define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) #0 {
   call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
   call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
@@ -218,7 +218,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2:
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) {
+define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) #0 {
   call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
   call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   ret void
@@ -227,7 +227,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3:
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8
-define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) {
+define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) #0 {
   call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
   call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
   ret void
@@ -237,7 +237,7 @@
 ;CHECK-NOT: s_waitcnt
 ;CHECK-NEXT: %bb.
 ;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8
-define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
+define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) #0 {
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i8
@@ -249,7 +249,7 @@
 ;CHECK-NOT: s_waitcnt
 ;CHECK-NEXT: %bb.
 ;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16
-define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
+define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) #0 {
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -5,7 +5,7 @@
 ; GCN-LABEL: {{^}}gs_const:
 ; GCN-NOT: v_cmpx
 ; GCN: s_mov_b64 exec, 0
-define amdgpu_gs void @gs_const() {
+define amdgpu_gs void @gs_const() #0 {
   %tmp = icmp ule i32 0, 3
   %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
   %c1 = fcmp oge float %tmp1, 0.0
@@ -23,7 +23,7 @@
 ; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
 ; GFX10: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
-define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) {
+define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) #0 {
   %tmp0 = fcmp olt float %arg13, 0.000000e+00
   %c1 = fcmp oge float %arg14, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
@@ -36,7 +36,7 @@
 ; GCN-NEXT: %bb.
 ; GCN-NEXT: %bb.
 ; GCN-NEXT: s_endpgm
-define amdgpu_gs void @true() {
+define amdgpu_gs void @true() #0 {
   call void @llvm.amdgcn.kill(i1 true)
   ret void
 }
@@ -44,7 +44,7 @@
 ; GCN-LABEL: {{^}}false:
 ; GCN-NOT: v_cmpx
 ; GCN: s_mov_b64 exec, 0
-define amdgpu_gs void @false() {
+define amdgpu_gs void @false() #0 {
   call void @llvm.amdgcn.kill(i1 false)
   ret void
 }
@@ -54,7 +54,7 @@
 ; GCN: v_cmp_lt_i32
 ; GCN: s_or_b64 s[0:1]
 ; GCN: s_and_b64 exec, exec, s[0:1]
-define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
   %c1 = icmp slt i32 %a, %b
   %c2 = icmp slt i32 %c, %d
   %x = or i1 %c1, %c2
@@ -67,7 +67,7 @@
 ; GCN: v_cmp_lt_i32
 ; GCN: s_xor_b64 s[0:1]
 ; GCN: s_andn2_b64 exec, exec, s[0:1]
-define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
   %c1 = icmp slt i32 %a, %b
   %c2 = icmp slt i32 %c, %d
   %x = xor i1 %c1, %c2
@@ -79,7 +79,7 @@
 ; GCN-LABEL: {{^}}oeq:
 ; GCN: v_cmpx_eq_f32
 ; GCN-NOT: s_and
-define amdgpu_gs void @oeq(float %a) {
+define amdgpu_gs void @oeq(float %a) #0 {
   %c1 = fcmp oeq float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -88,7 +88,7 @@
 ; GCN-LABEL: {{^}}ogt:
 ; GCN: v_cmpx_lt_f32
 ; GCN-NOT: s_and
-define amdgpu_gs void @ogt(float %a) {
+define amdgpu_gs void @ogt(float %a) #0 {
   %c1 = fcmp ogt float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -97,7 +97,7 @@
 ; GCN-LABEL: {{^}}oge:
 ; GCN: v_cmpx_le_f32
 ; GCN-NOT: s_and
-define amdgpu_gs void @oge(float %a) {
+define amdgpu_gs void @oge(float %a) #0 {
   %c1 = fcmp oge float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -106,7 +106,7 @@
 ; GCN-LABEL: {{^}}olt:
 ; GCN: v_cmpx_gt_f32
 ; GCN-NOT: s_and
-define amdgpu_gs void @olt(float %a) {
+define amdgpu_gs void @olt(float %a) #0 {
   %c1 = fcmp olt float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -115,7 +115,7 @@
 ; GCN-LABEL: {{^}}ole:
 ; GCN: v_cmpx_ge_f32
 ; GCN-NOT: s_and
-define amdgpu_gs void @ole(float %a) {
+define amdgpu_gs void @ole(float %a) #0 {
   %c1 = fcmp ole float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -124,7 +124,7 @@
 ; GCN-LABEL: {{^}}one:
 ; GCN: v_cmpx_lg_f32
 ; GCN-NOT: s_and
-define amdgpu_gs void @one(float %a) {
+define amdgpu_gs void @one(float %a) #0 {
   %c1 = fcmp one float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -133,7 +133,7 @@
 ; GCN-LABEL: {{^}}ord:
 ; FIXME: This is absolutely unimportant, but we could use the cmpx variant here.
 ; GCN: v_cmp_o_f32
-define amdgpu_gs void @ord(float %a) {
+define amdgpu_gs void @ord(float %a) #0 {
   %c1 = fcmp ord float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -142,7 +142,7 @@
 ; GCN-LABEL: {{^}}uno:
 ; FIXME: This is absolutely unimportant, but we could use the cmpx variant here.
 ; GCN: v_cmp_u_f32
-define amdgpu_gs void @uno(float %a) {
+define amdgpu_gs void @uno(float %a) #0 {
   %c1 = fcmp uno float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -151,7 +151,7 @@
 ; GCN-LABEL: {{^}}ueq:
 ; GCN: v_cmpx_nlg_f32
 ; GCN-NOT: s_and
-define amdgpu_gs void @ueq(float %a) {
+define amdgpu_gs void @ueq(float %a) #0 {
   %c1 = fcmp ueq float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -160,7 +160,7 @@
 ; GCN-LABEL: {{^}}ugt:
 ; GCN: v_cmpx_nge_f32
 ; GCN-NOT: s_and
-define amdgpu_gs void @ugt(float %a) {
+define amdgpu_gs void @ugt(float %a) #0 {
   %c1 = fcmp ugt float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -170,7 +170,7 @@
 ; SI: v_cmpx_ngt_f32_e32 vcc, -1.0
 ; GFX10: v_cmpx_ngt_f32_e32 -1.0
 ; GCN-NOT: s_and
-define amdgpu_gs void @uge(float %a) {
+define amdgpu_gs void @uge(float %a) #0 {
   %c1 = fcmp uge float %a, -1.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -180,7 +180,7 @@
 ; SI: v_cmpx_nle_f32_e32 vcc, -2.0
 ; GFX10: v_cmpx_nle_f32_e32 -2.0
 ; GCN-NOT: s_and
-define amdgpu_gs void @ult(float %a) {
+define amdgpu_gs void @ult(float %a) #0 {
   %c1 = fcmp ult float %a, -2.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -190,7 +190,7 @@
 ; SI: v_cmpx_nlt_f32_e32 vcc, 2.0
 ; GFX10: v_cmpx_nlt_f32_e32 2.0
 ; GCN-NOT: s_and
-define amdgpu_gs void @ule(float %a) {
+define amdgpu_gs void @ule(float %a) #0 {
   %c1 = fcmp ule float %a, 2.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -200,7 +200,7 @@
 ; SI: v_cmpx_neq_f32_e32 vcc, 0
 ; GFX10: v_cmpx_neq_f32_e32 0
 ; GCN-NOT: s_and
-define amdgpu_gs void @une(float %a) {
+define amdgpu_gs void @une(float %a) #0 {
   %c1 = fcmp une float %a, 0.0
   call void @llvm.amdgcn.kill(i1 %c1)
   ret void
@@ -210,7 +210,7 @@
 ; SI: v_cmpx_ngt_f32_e32 vcc, 1.0
 ; GFX10: v_cmpx_ngt_f32_e32 1.0
 ; GCN-NOT: s_and
-define amdgpu_gs void @neg_olt(float %a) {
+define amdgpu_gs void @neg_olt(float %a) #0 {
   %c1 = fcmp olt float %a, 1.0
   %c2 = xor i1 %c1, 1
   call void @llvm.amdgcn.kill(i1 %c2)
@@ -235,7 +235,7 @@
 ; GCN: v_cmp_neq_f32_e32 vcc, 0
 ; GCN: s_wqm_b64 s[0:1], vcc
 ; GCN: s_and_b64 exec, exec, s[0:1]
-define amdgpu_ps void @wqm(float %a) {
+define amdgpu_ps void @wqm(float %a) #0 {
   %c1 = fcmp une float %a, 0.0
   %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
   call void @llvm.amdgcn.kill(i1 %c2)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -7,7 +7,7 @@
 ;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc{{$}}
 ;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc{{$}}
 ;CHECK: s_waitcnt
-define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
   %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1)
@@ -26,7 +26,7 @@
 ;GFX10: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc{{$}}
 ;GFX10: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc dlc{{$}}
 ;CHECK: s_waitcnt
-define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(<4 x i32> inreg) {
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 4)
   %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 5)
@@ -40,7 +40,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
 ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0)
   ret <4 x float> %data
@@ -50,7 +50,7 @@
 ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
 ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0)
   ret <4 x float> %data
@@ -59,7 +59,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_ofs:
 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0)
   ret <4 x float> %data
@@ -68,7 +68,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 {
 main_body:
   %ofs = add i32 %1, 60
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0)
@@ -78,7 +78,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_x1:
 ;CHECK: buffer_load_dword v0, v0, s[0:3], 0 offen
 ;CHECK: s_waitcnt
-define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) {
+define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
 main_body:
   %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
   ret float %data
@@ -87,7 +87,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_x2:
 ;CHECK: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen
 ;CHECK: s_waitcnt
-define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) {
+define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
 main_body:
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
   ret <2 x float> %data
@@ -97,7 +97,7 @@
 ;PREGFX10: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0
 ;GFX10: v_add_nc_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], -16, v0
 ;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen
-define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
+define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) #0 {
 main_body:
   %ofs.1 = add i32 %ofs, -16
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs.1, i32 0, i32 0)
@@ -109,7 +109,7 @@
 ; CHECK-LABEL: buffer_load_mmo:
 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
+define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) #0 {
 entry:
   store float 0.0, float addrspace(3)* %lds
   %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
@@ -123,7 +123,7 @@
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) {
+define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) #0 {
 main_body:
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
@@ -148,7 +148,7 @@
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) {
+define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) #0 {
 main_body:
   %a = shl i32 %inp, 6
   %a1 = or i32 %a, 4
@@ -174,7 +174,7 @@
 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
+define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) #0 {
 main_body:
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
@@ -197,7 +197,7 @@
 ;CHECK-NEXT: %bb.
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) {
+define amdgpu_ps void @buffer_load_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) #0 {
 main_body:
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
@@ -216,7 +216,7 @@
 ;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 4, v0
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) {
+define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) #0 {
 main_body:
   %a = shl i32 %inp, 4
   %a1 = add i32 %a, 4
@@ -236,7 +236,7 @@
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
+define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
   %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0)
@@ -253,7 +253,7 @@
 ;CHECK-NEXT: %bb.
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 ;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) {
+define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
   %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0)
@@ -270,7 +270,7 @@
 ;CHECK: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc
 ;CHECK: buffer_load_dword v6, off, s[0:3], 0 slc
 ;CHECK: s_waitcnt
-define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) {
+define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0)
   %data_glc = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 1)
@@ -290,7 +290,7 @@
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %tmp2 = zext i8 %tmp to i32
@@ -304,7 +304,7 @@
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %tmp2 = zext i16 %tmp to i32
@@ -318,7 +318,7 @@
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %tmp2 = sext i8 %tmp to i32
@@ -332,7 +332,7 @@
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %tmp = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %tmp2 = sext i16 %tmp to i32
@@ -345,7 +345,7 @@
 ;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], off, s[0:3], 0
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b16 v0, [[VAL]]
-define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr) {
+define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr) #0 {
 main_body:
   %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   store half %val, half addrspace(3)* %ptr
@@ -357,7 +357,7 @@
 ;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], off, s[0:3], 0
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b32 v0, [[VAL]]
-define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr) {
+define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr) #0 {
 main_body:
   %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   store <2 x half> %val, <2 x half> addrspace(3)* %ptr
@@ -369,7 +369,7 @@
 ;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b64 v0, [[VAL]]
-define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr) {
+define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr) #0 {
 main_body:
   %val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   store <4 x half> %val, <4 x half> addrspace(3)* %ptr
@@ -381,7 +381,7 @@
 ;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], off, s[0:3], 0
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b32 v0, [[VAL]]
-define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr) {
+define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr) #0 {
 main_body:
   %val = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr
@@ -393,7 +393,7 @@
 ;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b64 v0, [[VAL]]
-define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr) {
+define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr) #0 {
 main_body:
   %val = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr
@@ -405,7 +405,7 @@
 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
 ;CHECK: s_waitcnt
-define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
+define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
   %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0)
@@ -427,7 +427,7 @@
 ;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:28
 ;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:32
 ;CHECK: s_waitcnt
-define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) {
+define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 8)
   %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 8)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -6,7 +6,7 @@
 ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
 ;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
-define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 {
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
@@ -17,7 +17,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_immoffs:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
-define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 {
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0)
   ret void
@@ -26,7 +26,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_ofs:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
-define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 {
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
   ret void
@@ -41,7 +41,7 @@
 ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
-define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 {
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0)
@@ -52,7 +52,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x1:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
-define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) {
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) #0 {
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -71,7 +71,7 @@
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
-define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 {
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   %a3 = add i32 %a, 12
@@ -91,7 +91,7 @@
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28
-define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 {
   %a = shl i32 %inp, 6
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
@@ -114,7 +114,7 @@
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
-define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 {
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   %a3 = add i32 %a, 12
@@ -133,7 +133,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_and:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) {
+define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) #0 {
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
@@ -144,7 +144,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_or:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) {
+define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) #0 {
   %a = shl i32 %inp, 4
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
@@ -157,7 +157,7 @@
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
-define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 {
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
@@ -170,7 +170,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) {
+define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) #0 {
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
   ret void
@@ -181,7 +181,7 @@
 ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc
 ;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc
-define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) {
+define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) #0 {
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
@@ -194,7 +194,7 @@
 ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
 ;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0
 ;CHECK-NEXT: s_endpgm
-define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
+define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) #0 {
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i8
@@ -207,7 +207,7 @@
 ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
 ;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0
 ;CHECK-NEXT: s_endpgm
-define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
+define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) #0 {
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i16
@@ -220,7 +220,7 @@
 ;CHECK-NOT: v0
 ;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
 ;CHECK-NEXT: s_endpgm
-define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) {
+define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) #0 {
 main_body:
   %trunc = trunc i32 %v1 to i16
   %cast = bitcast i16 %trunc to half
@@ -231,7 +231,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_v2f16:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
-define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) {
+define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) #0 {
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -251,7 +251,7 @@
 ;CHECK-NOT: v0
 ;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
 ;CHECK-NEXT: s_endpgm
-define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) {
+define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) #0 {
 main_body:
   %trunc = trunc i32 %v1 to i16
   call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
@@ -261,7 +261,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_v2i16:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
-define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) {
+define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) #0 {
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -280,7 +280,7 @@
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 ;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
-define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 {
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
@@ -297,7 +297,7 @@
 ;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16
 ;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28
 ;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32
-define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) #0 {
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -7,7 +7,7 @@
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
-define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
@@ -23,7 +23,7 @@
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
-define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
@@ -43,7 +43,7 @@
 ;CHECK: buffer_store_dword
 ;CHECK-NOT: s_wqm_b64 exec, exec
 ;CHECK: v_add_f32_e32
-define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
@@ -65,7 +65,7 @@
 ;CHECK: buffer_store_dword
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: v_add_f32_e32
-define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
@@ -87,7 +87,7 @@
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
 ;CHECK-NOT: s_wqm_b64
-define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
@@ -109,7 +109,7 @@
 ;CHECK: %IF
 ;CHECK: buffer_load
 ;CHECK: buffer_load
-define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
+define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) #1 {
 main_body:
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %IF, label %ELSE
@@ -145,7 +145,7 @@
 ;CHECK-NOT: s_and_b64 exec
 ;CHECK: buffer_load
 ;CHECK: buffer_load
-define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
+define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) #1 {
 main_body:
   %c.bc = bitcast i32 %c to float
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -6,7 +6,7 @@
 ;CHECK: buffer_load_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc
 ;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
 ;CHECK: s_waitcnt
-define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
   %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -20,7 +20,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
 ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0)
   ret <4 x float> %data
@@ -30,7 +30,7 @@
 ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
 ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 4, i32 8188, i32 0)
   ret <4 x float> %data
@@ -39,7 +39,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_idx:
 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0)
   ret <4 x float> %data
@@ -48,7 +48,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_ofs:
 ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0)
   ret <4 x float> %data
@@ -57,7 +57,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
 ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 {
 main_body:
   %ofs = add i32 %1, 60
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0)
@@ -67,7 +67,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_both:
 ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0)
   ret <4 x float> %data
@@ -77,7 +77,7 @@
 ;CHECK: v_mov_b32_e32 v2, v0
 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
 ;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 {
 main_body:
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0)
   ret <4 x float> %data
@@ -86,7 +86,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_x1:
 ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
 ;CHECK: s_waitcnt
-define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
 main_body:
   %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
   ret float %data
@@ -95,7 +95,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_x2:
 ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
 ;CHECK: s_waitcnt
-define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
 main_body:
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
   ret <2 x float> %data
@@ -104,7 +104,7 @@
 ;CHECK-LABEL: {{^}}buffer_load_negative_offset:
 ;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, -16, v0
 ;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen
-define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
+define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) #0 {
 main_body:
   %ofs.1 = add i32 %ofs, -16
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i32 0, i32 0)
@@ -116,7 +116,7 @@
 ; CHECK-LABEL: buffer_load_mmo:
 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
+define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) #0 {
 entry:
   store float 0.0, float addrspace(3)* %lds
   %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -130,7 +130,7 @@
 ;CHECK: buffer_load_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc
 ;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc
 ;CHECK: s_waitcnt
-define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) {
+define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) #0 {
 main_body:
   %data = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
   %data_glc = call <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -150,7 +150,7 @@
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
   %tmp2 = zext i8 %tmp to i32
@@ -164,7 +164,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
 main_body:
   %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
   %tmp2 = zext i16 %tmp to i32
@@ -178,7 +178,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
 main_body:
   %tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
   %tmp2 = sext i8 %tmp to i32
@@ -192,7 +192,7 @@
 ;CHECK-NEXT: s_waitcnt vmcnt(0)
 ;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
 ;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
 main_body:
   %tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
   %tmp2 = sext i16 %tmp to i32
@@ -205,7 +205,7 @@
 ;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b16 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr, i32 %idx) #0 {
 main_body:
   %val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
   store half %val, half addrspace(3)* %ptr
@@ -217,7 +217,7 @@
 ;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b32 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr, i32 %idx) #0 {
 main_body:
   %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
   store <2 x half> %val, <2 x half> addrspace(3)* %ptr
@@ -229,7 +229,7 @@
 ;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b64 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr, i32 %idx) #0 {
 main_body:
   %val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
   store <4 x half> %val, <4 x half> addrspace(3)* %ptr
@@ -241,7 +241,7 @@
 ;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b16 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, i16 addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, i16 addrspace(3)* %ptr, i32 %idx) #0 {
 main_body:
   %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
   store i16 %val, i16 addrspace(3)* %ptr
@@ -253,7 +253,7 @@
 ;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b32 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr, i32 %idx) #0 {
 main_body:
   %val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
   store <2 x i16> %val, <2 x i16> addrspace(3)* %ptr
@@ -265,7 +265,7 @@
 ;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: ds_write_b64 v0, [[VAL]]
-define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr, i32 %idx) {
+define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr, i32 %idx) #0 {
 main_body:
   %val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
   store <4 x i16> %val, <4 x i16> addrspace(3)* %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -6,7 +6,7 @@
 ;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
 ;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc
 ;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
-define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 {
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -17,7 +17,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_immoffs:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42
-define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 {
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
   ret void
@@ -26,7 +26,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_idx:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 {
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
   ret void
@@ -35,7 +35,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_ofs:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 {
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0)
   ret void
@@ -44,7 +44,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_both:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 {
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0)
   ret void
@@ -54,7 +54,7 @@
 ;CHECK: v_mov_b32_e32 v6, v4
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
+define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 {
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0)
   ret void
@@ -69,7 +69,7 @@
 ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 {
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0)
@@ -80,7 +80,7 @@
 ;CHECK-LABEL: {{^}}buffer_store_x1:
 ;CHECK-NOT: s_waitcnt
 ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) #0 {
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
@@ -100,7 +100,7 @@
 ;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
 ;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc
 ;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc
-define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) {
+define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) #0 {
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -113,7 +113,7 @@
 ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
 ;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
 ;CHECK-NEXT: s_endpgm
-define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) #0 {
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i8
@@ -126,7 +126,7 @@
 ;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}}
 ;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
 ;CHECK-NEXT: s_endpgm
-define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, i32 %index) #0 {
   %v2 = fptrunc float %v1 to half
   call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
@@ -135,7 +135,7 @@
 ;CHECK-LABEL: {{^}}struct_buffer_store_v2f16:
 ;CHECK-NEXT: %bb.
 ;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen
-define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %v1, i32 %index) {
+define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %v1, i32 %index) #0 {
   call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
@@ -143,7 +143,7 @@
 ;CHECK-LABEL: {{^}}struct_buffer_store_v4f16:
 ;CHECK-NEXT: %bb.
 ;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen
-define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %v1, i32 %index) {
+define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %v1, i32 %index) #0 {
   call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
@@ -153,7 +153,7 @@
 ;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
 ;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
 ;CHECK-NEXT: s_endpgm
-define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, i32 %index) #0 {
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i16
@@ -164,7 +164,7 @@
 ;CHECK-LABEL: {{^}}struct_buffer_store_vif16:
 ;CHECK-NEXT: %bb.
 ;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen
-define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16> %v1, i32 %index) {
+define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16> %v1, i32 %index) #0 {
   call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
@@ -172,7 +172,7 @@
 ;CHECK-LABEL: {{^}}struct_buffer_store_v4i16:
 ;CHECK-NEXT: %bb.
 ;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen
-define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %index) {
+define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %index) #0 {
   call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll
--- a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll
@@ -6,6 +6,8 @@
 ; GCN:       .Lfunc_begin0:
 ; GCN-NEXT:    .file 0
 ; GCN-NEXT:    .loc 0 3 0 ; /tmp/dbg.cl:3:0
+; GCN-NEXT:    .cfi_sections .debug_frame
+; GCN-NEXT:    .cfi_startproc
 ; GCN-NEXT:  ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:  .Ltmp0:
@@ -24,6 +26,7 @@
 ; GCN-LABEL: split_v4f32_multi_arg:
 ; GCN:       .Lfunc_begin1:
 ; GCN-NEXT:    .loc 0 7 0 ; /tmp/dbg.cl:7:0
+; GCN-NEXT:    .cfi_startproc
 ; GCN-NEXT:  ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:  .Ltmp2:
@@ -56,6 +59,7 @@
 ; GCN-LABEL: split_v4f16_arg:
 ; GCN:       .Lfunc_begin2:
 ; GCN-NEXT:    .loc 0 11 0 is_stmt 1 ; /tmp/dbg.cl:11:0
+; GCN-NEXT:    .cfi_startproc
 ; GCN-NEXT:  ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:  .Ltmp8:
@@ -72,6 +76,7 @@
 ; GCN-LABEL: split_f64_arg:
 ; GCN:       .Lfunc_begin3:
 ; GCN-NEXT:    .loc 0 15 0 ; /tmp/dbg.cl:15:0
+; GCN-NEXT:    .cfi_startproc
 ; GCN-NEXT:  ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:  .Ltmp10:
@@ -88,6 +93,7 @@
 ; GCN-LABEL: split_v2f64_arg:
 ; GCN:       .Lfunc_begin4:
 ; GCN-NEXT:    .loc 0 19 0 ; /tmp/dbg.cl:19:0
+; GCN-NEXT:    .cfi_startproc
 ; GCN-NEXT:  ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:  .Ltmp12:
@@ -106,6 +112,7 @@
 ; GCN-LABEL: split_i64_arg:
 ; GCN:       .Lfunc_begin5:
 ; GCN-NEXT:    .loc 0 23 0 ; /tmp/dbg.cl:23:0
+; GCN-NEXT:    .cfi_startproc
 ; GCN-NEXT:  ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:  .Ltmp14:
@@ -122,6 +129,7 @@
 ; GCN-LABEL: split_ptr_arg:
 ; GCN:       .Lfunc_begin6:
 ; GCN-NEXT:    .loc 0 27 0 ; /tmp/dbg.cl:27:0
+; GCN-NEXT:    .cfi_startproc
 ; GCN-NEXT:  ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:  .Ltmp16:
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -5,7 +5,7 @@
 ;
 ;CHECK-LABEL: {{^}}test1:
 ;CHECK-NOT: s_wqm
-define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
+define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) #1 {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
@@ -47,7 +47,7 @@
 ;CHECK: store
 ;CHECK-NOT: exec
 ;CHECK: .size test3
-define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
+define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) #1 {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %tex.1 = bitcast <4 x float> %tex to <4 x i32>
@@ -98,7 +98,7 @@
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: image_sample
 ;CHECK: image_sample
-define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
+define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) #1 {
 main_body:
   %c.1 = mul i32 %c, %d
 
@@ -120,7 +120,7 @@
 ; WQM was inserting an unecessary v_mov to self after the v_add. Make sure this
 ; does not happen - the v_add should write the return reg directly.
 ;CHECK-NOT: v_mov_b32_e32
-define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
@@ -136,7 +136,7 @@
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
-define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
@@ -154,7 +154,7 @@
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
-define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
@@ -170,7 +170,7 @@
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_{{[iu]}}32_e32
-define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
@@ -192,7 +192,7 @@
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
 ;CHECK: v_add_f32_e32
-define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
+define amdgpu_ps float @test_wwm3(i32 inreg %idx) #1 {
 main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
@@ -221,7 +221,7 @@
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
 ;CHECK-NEXT: v_mov_b32_e32
-define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
+define amdgpu_ps float @test_wwm4(i32 inreg %idx) #1 {
 main_body:
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
@@ -250,7 +250,7 @@
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG]]
 ;CHECK: s_wqm_b64 exec, exec
-define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
@@ -276,7 +276,7 @@
 ;VI-CHECK: flat_load_dword
 ;CHECK: v_add_f32_e32
 ;CHECK: s_mov_b64 exec, [[ORIG2]]
-define amdgpu_ps float @test_wwm6_then() {
+define amdgpu_ps float @test_wwm6_then() #1 {
 main_body:
   %src0 = load volatile float, float addrspace(1)* undef
   ; use mbcnt to make sure the branch is divergent
@@ -309,7 +309,7 @@
 ;SI-CHECK: buffer_load_dword
 ;VI-CHECK: flat_load_dword
 ;CHECK: s_mov_b64 exec, [[ORIG2]]
-define amdgpu_ps float @test_wwm6_loop() {
+define amdgpu_ps float @test_wwm6_loop() #1 {
 main_body:
   %src0 = load volatile float, float addrspace(1)* undef
   ; use mbcnt to make sure the branch is divergent
@@ -339,7 +339,7 @@
 ;CHECK: s_not_b64 exec, exec
 ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
 ;CHECK: v_add_{{[iu]}}32_e32
-define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
+define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) #1 {
 main_body:
   %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
   %src.0 = bitcast float %src to i32
@@ -357,7 +357,7 @@
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
-define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
+define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) #1 {
 main_body:
   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
   %src1.0 = bitcast float %src1 to i32
@@ -389,7 +389,7 @@
 ;CHECK: %IF
 ;CHECK: image_sample
 ;CHECK: image_sample
-define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
+define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) #1 {
 main_body:
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %IF, label %ELSE
@@ -432,7 +432,7 @@
 ;CHECK: s_or_b64 exec, exec,
 ;CHECK: v_mov_b32_e32 v0
 ;CHECK: ; return
-define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
+define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) #1 {
 main_body:
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %ELSE, label %IF
@@ -468,7 +468,7 @@
 ;CHECK: store
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: v_cmp
-define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) #1 {
 main_body:
   %idx.1 = extractelement <3 x i32> %idx, i32 0
   %data.1 = extractelement <2 x float> %data, i32 0
@@ -512,7 +512,7 @@
 ;CHECK: image_sample
 ;CHECK-DAG: v_cmp
 ;CHECK-DAG: store
-define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) #1 {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %tex0 = extractelement <4 x float> %tex, i32 0
@@ -550,7 +550,7 @@
 ;CHECK: %END
 ;CHECK: image_sample
 ;CHECK: image_sample
-define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
+define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) #1 {
 main_body:
   %cond = icmp eq i32 %y, 0
   br i1 %cond, label %IF, label %END
@@ -582,7 +582,7 @@
 ;CHECK: buffer_store_dword
 ;CHECK: s_mov_b64 exec, [[SAVE]]
 ;CHECK: image_sample
-define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
+define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) #1 {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %idx.0 = extractelement <2 x i32> %idx, i32 0
@@ -615,7 +615,7 @@
 ; CHECK: buffer_store_dword
 ; CHECK-NOT: wqm
 ; CHECK: v_cmpx_
-define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) #1 {
 main_body:
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %tex0 = extractelement <4 x float> %tex, i32 0
@@ -803,7 +803,7 @@
 ;CHECK: s_or_saveexec_b64 {{.*}}, -1
 ;CHECK: ds_swizzle
 ;
-define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
+define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) #1 {
 main_body:
   %c.bc = bitcast i32 %c to float
   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
diff --git a/llvm/test/DebugInfo/AMDGPU/cfi.ll b/llvm/test/DebugInfo/AMDGPU/cfi.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/DebugInfo/AMDGPU/cfi.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - %s | llvm-dwarfdump -debug-frame - | FileCheck %s
+
+; CHECK: .debug_frame contents:
+; CHECK: 00000000 0000000c ffffffff CIE
+; CHECK-NEXT:   Version:               4
+; CHECK-NEXT:   Augmentation:          ""
+; CHECK-NEXT:   Address size:          8
+; CHECK-NEXT:   Segment desc size:     0
+; CHECK-NEXT:   Code alignment factor: 4
+; CHECK-NEXT:   Data alignment factor: 4
+; CHECK-NEXT:   Return address column: 16
+; CHECK-EMPTY:
+; CHECK-NEXT:   DW_CFA_nop:
+; CHECK-EMPTY:
+; CHECK-NEXT: 00000010 {{[0-9]+}} 00000000 FDE cie=00000000 pc=00000000...{{[0-9]+}}
+; CHECK: .eh_frame contents:
+
+define void @func() #0 {
+  ret void
+}
+
+attributes #0 = { nounwind "no-frame-pointer-elim"="true" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+
+!0 = !{i32 2, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, emissionKind: FullDebug)
+!3 = !DIFile(filename: "file", directory: "dir")
diff --git a/llvm/test/MC/ELF/AMDGPU/cfi.s b/llvm/test/MC/ELF/AMDGPU/cfi.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/ELF/AMDGPU/cfi.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc -filetype=asm -mcpu=gfx900 -triple amdgcn-amd-amdhsa %s -o - | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -filetype=obj -mcpu=gfx900 -triple amdgcn-amd-amdhsa %s -o - | llvm-readobj -S --sr --sd | FileCheck --check-prefix=READOBJ %s
+
+f:
+	.cfi_sections .debug_frame
+	.cfi_startproc
+	s_nop 0
+	.cfi_endproc
+
+// ASM: f:
+// ASM-NEXT: .cfi_sections .debug_frame
+// FIXME: Determine why an extra empty line is emitted here.
+// ASM-EMPTY:
+// ASM-NEXT: .cfi_startproc
+// ASM-NEXT: s_nop 0
+// FIXME: Determine why an extra empty line is emitted here.
+// ASM-EMPTY:
+// ASM-NEXT: .cfi_endproc
+
+// READOBJ:        Section {
+// READOBJ:          Name: .debug_frame
+// READOBJ-NEXT:     Type: SHT_PROGBITS
+// READOBJ-NEXT:     Flags [
+// READOBJ-NEXT:     ]
+// READOBJ-NEXT:     Address: 0x0
+// READOBJ-NEXT:     Offset: 0x48
+// READOBJ-NEXT:     Size: 40
+// READOBJ-NEXT:     Link: 0
+// READOBJ-NEXT:     Info: 0
+// READOBJ-NEXT:     AddressAlignment: 8
+// READOBJ-NEXT:     EntrySize: 0
+// READOBJ-NEXT:     Relocations [
+// READOBJ-NEXT:     ]
+// READOBJ-NEXT:     SectionData (
+// READOBJ-NEXT:       0000: 0C000000 FFFFFFFF 04000800 04041000
+// READOBJ-NEXT:       0010: 14000000 00000000 00000000 00000000
+// READOBJ-NEXT:       0020: 04000000 00000000
+// READOBJ-NEXT:     )
+// READOBJ-NEXT:   }
+
+// READOBJ:        Section {
+// READOBJ:          Name: .rela.debug_frame
+// READOBJ-NEXT:     Type: SHT_RELA
+// READOBJ-NEXT:     Flags [
+// READOBJ-NEXT:     ]
+// READOBJ-NEXT:     Address: 0x0
+// READOBJ-NEXT:     Offset:
+// READOBJ-NEXT:     Size: 48
+// READOBJ-NEXT:     Link:
+// READOBJ-NEXT:     Info:
+// READOBJ-NEXT:     AddressAlignment: 8
+// READOBJ-NEXT:     EntrySize: 24
+// READOBJ-NEXT:     Relocations [
+// READOBJ-NEXT:       0x14 R_AMDGPU_ABS32 .debug_frame 0x0
+// READOBJ-NEXT:       0x18 R_AMDGPU_ABS64 .text 0x0
+// READOBJ-NEXT:     ]
+// READOBJ:        }
diff --git a/llvm/test/MC/ELF/AMDGPU/lit.local.cfg b/llvm/test/MC/ELF/AMDGPU/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/ELF/AMDGPU/lit.local.cfg
@@ -0,0 +1,3 @@
+# We have to reset config.unsupported here because the parent directory is
+# predicated on 'X86'.
+config.unsupported = not 'AMDGPU' in config.root.targets